def evaluate_model_and_save(model_file):
    analysis = evaluate_model(model_file)
    print(analysis)
    filename = model_file.split(".")[0] + '_analysis.csv'
    full_path = GPATH / MODELS_PATH / filename
    analysis.to_csv(full_path)
    logging.info(f"Saved model analysis to {full_path}")
def report_class_counts():
    df = get_dataframe().drop(['filenames'], axis=1)
    class_counts = df.sum()
    filename = get_timestring() + "_class_counts.csv"
    full_path = GPATH / REPORTS_PATH / filename
    class_counts.to_csv(full_path)
    logging.info(f"Saved class count report to {full_path}")
    print(f"Report saved to {full_path}")
def report_ALL_keyword_counts():
    df = get_raw_dataframe()
    k = df['keywords']
    k = k.str.replace(' ', '_')
    k = k.str.replace(',', ' ')
    cv = CountVectorizer()
    data_cv = cv.fit_transform(k)
    data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
    data_dtm.index = k.index
    # Sum the term counts in two chunks to keep peak memory down.
    summed = data_dtm[0:50000].sum()
    summed2 = data_dtm[50000:].sum()
    summed_all = summed + summed2
    sorted_df = summed_all.sort_values(ascending=False)
    top_1000 = sorted_df.head(1000)
    filename = get_timestring() + "_top_1k_keywords.csv"
    full_path = GPATH / REPORTS_PATH / filename
    top_1000.to_csv(full_path)
    logging.info(
        f"Saved report of ALL keyword counts from data in the csvs_ready directory to {full_path}"
    )
    print(f"Report saved to {full_path}")
def watching_stories(domain_list):
    """
    Watch stories of competitors.
    :param domain_list: targeted competitors' domain names
    :return:
    """
    fetch = fetch_time_line()
    db_category_list = __category_service.find_all_categories()
    db_interest_list = __interest_service.find_all_interests()
    for domain in domain_list:
        r = requests.get("https://api.newswhip.com/v1/publisher/" + domain + "/1?key=" + newswhip_key)
        response = json.loads(r.text)
        logging.info("Domain: " + domain + " & No of Articles: " + str(len(response['articles'])))
        for item in response['articles']:
            try:
                article_info = domparser.element_picker(item['link'].encode('utf-8'))
                if article_info['title'] is not None or article_info['feature_image'] is not None or article_info['url'] is not None:
                    article = {'title': '', 'url': '', 'description': '', 'keywords': '',
                               'feature_image': '', 'New_score': '', 'max_new_score': '',
                               'fb_like': '', 'tweet_count': '', 'publisher': '', 'uuid': '',
                               'published': '', 'category': [], 'interest': [], 'fetch': '',
                               'created_keys': []}
                    # Prefer the NewsWhip fields; fall back to the scraped article_info.
                    if item['headline'] is None:
                        article['title'] = article_info['title']
                    else:
                        article['title'] = item['headline'].encode('utf-8')
                    if item['link'] is None:
                        article['url'] = article_info['url']
                    else:
                        article['url'] = item['link'].encode('utf-8')
                    if item['excerpt'] is None:
                        article['description'] = article_info['description']
                    else:
                        article['description'] = item['excerpt']
                    if item['keywords'] is None:
                        article['keywords'] = article_info['keywords']
                    else:
                        article['keywords'] = (item['keywords']).split(',')
                    if item['image_link'] is None:
                        article['feature_image'] = article_info['feature_image']
                    else:
                        article['feature_image'] = item['image_link']
                    if 'nw_score' in item:
                        article['New_score'] = item['nw_score']
                    else:
                        article['New_score'] = 0
                    if 'max_nw_score' in item:
                        article['max_new_score'] = item['max_nw_score']
                    else:
                        article['max_new_score'] = 0
                    if 'total_engagement_count' in item['fb_data']:
                        article['fb_like'] = item['fb_data']['total_engagement_count']
                    else:
                        article['fb_like'] = 0
                    if 'tw_count' in item['tw_data']:
                        article['tweet_count'] = item['tw_data']['tw_count']
                    else:
                        article['tweet_count'] = 0
                    if 'publisher' in item['source']:
                        article['publisher'] = item['source']['publisher']
                    else:
                        article['publisher'] = "None"
                    if 'uuid' in item:
                        article['uuid'] = item['uuid']
                    else:
                        article['uuid'] = 'None'
                    if 'publication_timestamp' in item:
                        article['published'] = time.strftime(
                            '%Y-%m-%d %H:%M',
                            time.localtime(item['publication_timestamp'] / 1000.0))
                    else:
                        article['published'] = "None"
                    article['fetch'] = current_epoch_time(datetime.now())

                    # Normalise the scraped categories: split comma-separated values,
                    # lower-case them and drop duplicates.
                    dummy_category = []
                    for i in article_info['category']:
                        split_list = i.split(',')
                        for itr in split_list:
                            if itr not in dummy_category:
                                dummy_category.append(itr.lower())
                    article_info['category'] = dummy_category

                    if not any(category['category'] in article_info['category']
                               for category in db_category_list):
                        for category_item in article_info['category']:
                            for interest in db_interest_list:
                                if category_item == interest['interest']:
                                    if category_item not in article['interest']:
                                        article['interest'].append(category_item)
                        if len(article['interest']) <= 0:
                            article['category'] = article_info['category']
                        else:
                            article['category'] = []
                            for int_item in article['interest']:
                                current_interest = list(filter(
                                    lambda member: int_item == member['interest'],
                                    db_interest_list))
                                if len(current_interest) == 1:
                                    current_category = list(filter(
                                        lambda member: current_interest[0]['category_id'] == member['_id'],
                                        db_category_list))
                                    if len(current_category) == 1:
                                        article['category'].append(current_category[0]['category'])
                    else:
                        if article['keywords'] is not None:
                            (article['interest'], return_category_ids) = checking_interest(article['keywords'])
                        article['category'] = article_info['category']

                    # Build the key-phrase lists from keywords, title and description.
                    key_phrases_list = []
                    raw_key_phrases_list = []
                    interest_category_id = []
                    if article_info['keywords']:
                        keywords_key_phrases = (''.join(map(str, ((article_info['keywords'][0]).decode('ascii', 'ignore')).lower()))).split(",")
                        key_phrases_list += keywords_key_phrases
                        raw_key_phrases_list += keywords_key_phrases
                    if article_info['title']:
                        title_key_phrases = extractKeyphrases(article_info['title'].decode('ascii', 'ignore'))
                        key_phrases_list += list(title_key_phrases)
                        raw_key_phrases_list.append(str(article_info['title'].decode('ascii', 'ignore')))
                    if article_info['description']:
                        description_key_phrases = extractKeyphrases(article_info['description'].decode('ascii', 'ignore'))
                        key_phrases_list += list(description_key_phrases)
                        raw_key_phrases_list.append(str(article_info['description'].decode('ascii', 'ignore')))

                    # Drop stop words and duplicates from the key phrases.
                    d = Counter(key_phrases_list)
                    keys_to_remove = ['', ' ', '%', 'an', 'a', ',', 'ii', 'r', 'so', 'is', 'in',
                                      'the', 'nbt', 'us', 'them', 's', '|', 'eisamay', 'navbharat',
                                      '-navbharat', 'navbharat times', 'samay', 'india']
                    refactor_key_list = []
                    for key in list(d.keys()):
                        if (key.strip()).lower() not in keys_to_remove and (key.strip()).lower() not in refactor_key_list:
                            refactor_key_list.append((key.strip()).lower())
                    article['created_keys'] = refactor_key_list

                    if article['created_keys'] is not None:
                        (created_interest, interest_category_id) = checking_interest(raw_key_phrases_list)
                        if created_interest is not None:
                            article['interest'] += created_interest
                        if interest_category_id is not None:
                            # Pick the category id that occurs most often among the matched interests.
                            cat_dict = Counter(interest_category_id)
                            top_order_category = ''
                            top = 0
                            for cat_item in cat_dict.keys():
                                if cat_dict[cat_item] >= top:
                                    top_order_category = cat_item
                                    top = cat_dict[cat_item]
                            if top_order_category:
                                supposed_category = __category_service.find_category(top_order_category)
                                article['category'].append(supposed_category['category'])

                    if article['interest']:
                        article['status'] = True
                    else:
                        article['status'] = False
                    __story_service.save_story(article)
                    __fetch_service.save_fetch(fetch)
            except Exception as ex:
                logging.info("Runtime Error: " + str(ex))
articles = self.__story_service.find_latest_stories()
for article in articles:
    if article["_id"] != p_article["_id"]:
        article_cosine_smlr_score = self.get_cosine_similarity(
            p_article["created_keys"], article["created_keys"])
        similar_category_score = self.get_smlr_category_score(
            p_article["category"], article["category"])
        article_cosine_smlr_score = article_cosine_smlr_score + similar_category_score
        if article_cosine_smlr_score >= 0.5:
            article["group"] = "group" + str(gp_count)
            self.__story_service.update_story(article)


if __name__ == "__main__":
    while True:
        logging.info("Scheduler initialize....")
        logging.info("Start time: " + str(datetime.now()))
        __story_service = StoryService()
        __category_service = CategoryService()
        __interest_service = InterestService()
        __fetch_service = FetchService()
        with open('credentials.json', 'r') as credential_file:
            data = json.load(credential_file)
        newswhip_key = data["key"]
        competitors = data["competitors"]
        watching_stories(competitors)
        stories = __story_service.find_latest_stories()
        if len(stories) > 100:
            logging.info("Grouping initialize....")
            __story_service.reset_group()
            gp_count = 0
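# Illustrative sketch only: get_cosine_similarity() is defined elsewhere in this
# project and is not shown here. The function below is an assumption, showing one
# common way to score the overlap of two keyword lists (bag-of-words cosine);
# the name cosine_similarity_of_keywords is hypothetical.
import math
from collections import Counter


def cosine_similarity_of_keywords(keys_a, keys_b):
    """Return the cosine similarity of two lists of keyword strings."""
    counts_a, counts_b = Counter(keys_a), Counter(keys_b)
    shared = set(counts_a) & set(counts_b)
    numerator = sum(counts_a[k] * counts_b[k] for k in shared)
    norm_a = math.sqrt(sum(v * v for v in counts_a.values()))
    norm_b = math.sqrt(sum(v * v for v in counts_b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return numerator / (norm_a * norm_b)


# Example: two articles that share most of their created_keys score close to 1.0.
# cosine_similarity_of_keywords(['ipl', 'cricket', 'mumbai'], ['ipl', 'cricket', 'delhi'])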
"query": (trend["query"]).replace('%23', '#').replace('%22', '"').encode('utf-8'), "location": locations[idx], "name": trend["name"].encode('utf-8') } __twitter_service.save_twitter(new_dict) get_top_tweets(new_dict['query'], new_dict['location']) if __name__ == "__main__": while True: logging.info("Twitter initialize....") __twitter_service = TwitterService() __tweet_service = TweetService() with open('credentials.json', 'r') as credential_file: json_obj = json.load(credential_file) locations = json_obj['twitter']['location'] locationIDs = json_obj['twitter']['locationID'] consumer_key = json_obj['twitter']['app'][0]['TW_CONSUMER_KEY'] consumer_secret = json_obj['twitter']['app'][0][ 'TW_CONSUMER_SECRET'] access_token = json_obj['twitter']['app'][0]['TW_ACCESS_TOKEN'] access_token_secret = json_obj['twitter']['app'][0][ 'TW_ACC_TOKEN_SECRET'] auth = tweepy.OAuthHandler(consumer_key, consumer_secret) auth.set_access_token(access_token, access_token_secret)
    return RestResponse(all_group).to_json()


@route('/group/<group_id>')
def get_trending_group(group_id):
    group_stories = __story_service.get_group_stories(group_id)
    return RestResponse(group_stories).to_json()


@route('/twitter')
def location_trending():
    trending_hashtags = __twitter_service.get_location_trending()
    return RestResponse(trending_hashtags).to_json()


@route('/tweets')
def get_tweets():
    top_tweets = __tweet_service.get_tweets()
    return RestResponse(top_tweets).to_json()


if __name__ == "__main__":
    __user_service = UserService()
    __interest_service = InterestService()
    __category_service = CategoryService()
    __story_service = StoryService()
    __twitter_service = TwitterService()
    __tweet_service = TweetService()
    # Log before run(): the waitress server call blocks until shutdown.
    logging.info("server running....")
    run(host='0.0.0.0', port=8889, server='waitress')
from app.modules import *
from app.config import logging

logging.info("\nProgram started from here.")
driver = connecting_with_whatsapp()

# Fetching data from Excel/File:
to_forward, pause_time = retrieve_file_parameter()
pause_time = int(pause_time)
employee = r"EPL"
client = r"CLT"
ss_path = os.path.join(os.getcwd(), "app", "screenshots")
ss_path = ss_path + os.sep
# Data fetched.

while True:
    logging.info("\nWhile Loop started here.\n")
    # Delete already existing screenshots.
    delete_ss(ss_path)
    check_unread_from_these(driver, ss_path, employee, client)
    send_ss(driver, ss_path, to_forward)
    print("Successfully sent the screenshots!")
    sleep(randint(7, 10))
    logging.info("\nWhile Loop ended here.\n")
    logging.info("\n\nWaiting for " + str(pause_time) + " minutes to re-run the bot.\n\n")
def clean_csv_data_and_images():
    # Read in all csvs from the RAW_CSV_DIRECTORY and concat them into one dataframe
    files = glob.glob("{}/*.csv".format(GPATH / RAW_CSV_DIRECTORY))
    logging.info("Globbing {} csvs for cleaning.".format(len(files)))
    df = pd.concat((pd.read_csv(f) for f in files))
    logging.info("The df has shape {}.".format(df.shape))
    df.set_index('id', inplace=True)

    # Drop duplicates (based on id)
    count_before = len(df.index)
    df = df[~df.index.duplicated(keep='first')]
    count_after = len(df.index)
    logging.info(
        f'Dropped {count_before - count_after} items with duplicate indices.')

    # Drop na
    df.dropna(inplace=True)

    # Keyword preprocessing
    k = df['keywords']
    k = k.str.replace(' ', '_')
    k = k.str.replace(',', ' ')
    df.keywords = k

    # Get the filenames of the images cleanly without any paths
    filenames_list = []
    for i, row in df.iterrows():
        try:
            filename = eval(row.images)[0]['path'].split('/')[1]
        except Exception:
            filename = ""
        filenames_list.append(filename)
    df['filename'] = filenames_list

    # Drop the images column and other unused columns
    df.drop(['images', 'searchTerm', 'pageNum', 'contributer', 'type'],
            axis=1, inplace=True)

    # Ensure the files really are there
    badIds = []
    for i, row in df.iterrows():
        fn = GPATH / RAW_IMAGE_DIRECTORY / 'full' / row.filename
        if not path.exists(fn):
            badIds.append(i)
    if len(badIds) > 0:
        logging.warning(
            f'Found {len(badIds)} ids with no image file. Dropping them from the cleaned df.'
        )
        df.drop(badIds, inplace=True)
    else:
        logging.info('All files checked and exist while cleaning the data.')

    # Sort by index
    df.sort_index(inplace=True)
    print(df.info())

    # Clean the images
    fix_all_with_threads(df)

    # Drop a few more columns
    df.drop(['image_urls', 'filename'], axis=1, inplace=True)

    # Save out a compiled and cleaned csv
    now = datetime.now()
    timeString = now.strftime("%Y%m%d%H%M%S%f")
    csv_file_name = timeString + ".csv"
    full_path = GPATH / READY_CSV_DIRECTORY / csv_file_name
    df.to_csv(full_path)
    logging.info("The final df has shape {}.".format(df.shape))
    logging.info(f"Saving cleaned data to {full_path}.")
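# Side note (an assumption, not in the original code): row.images holds a stringified
# list of dicts, so ast.literal_eval from the standard library is a safer drop-in
# for eval() when parsing it, since it only accepts Python literals:
#
# import ast
# filename = ast.literal_eval(row.images)[0]['path'].split('/')[1]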