def evaluate_model_and_save(model_file):
    analysis = evaluate_model(model_file)
    print(analysis)
    filename = model_file.split(".")[0] + '_analysis.csv'
    full_path = GPATH / MODELS_PATH / filename
    analysis.to_csv(full_path)
    logging.info(f"Saved model analysis to {full_path}")
def report_class_counts():
    df = get_dataframe().drop(['filenames'], axis=1)
    class_counts = df.sum()
    filename = get_timestring() + "_class_counts.csv"
    full_path = GPATH / REPORTS_PATH / filename
    class_counts.to_csv(full_path)
    logging.info(f"Saved class count report to {full_path}")
    print(f"Report saved to {full_path}")
def report_ALL_keyword_counts():
    df = get_raw_dataframe()
    k = df['keywords']
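    # Join multi-word keywords with underscores so each keyword stays one token,
    # then turn commas into spaces so CountVectorizer splits on them.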
    k = k.str.replace(' ', '_')
    k = k.str.replace(',', ' ')
    cv = CountVectorizer()
    data_cv = cv.fit_transform(k)
    # Note: CountVectorizer.get_feature_names() was removed in scikit-learn 1.2;
    # use get_feature_names_out() on newer versions.
    data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
    data_dtm.index = k.index
    # Sum the term counts over the whole document-term matrix in two row slices, then combine.
    summed = data_dtm[0:50000].sum()
    summed2 = data_dtm[50000:].sum()
    summed_all = summed + summed2
    sorted_df = summed_all.sort_values(ascending=False)
    top_1000 = sorted_df.head(1000)
    filename = get_timestring() + "_top_1k_keywords.csv"
    full_path = GPATH / REPORTS_PATH / filename
    top_1000.to_csv(full_path)
    logging.info(
        f"Saved report of all keyword counts from data in the csvs_ready directory to {full_path}"
    )
    print(f"Report saved to {full_path}")
def watching_stories(domain_list):
    """
    watching stories of competitors
    :param domain_list: targeted competitors domain names
    :return:
    """
    fetch = fetch_time_line()
    db_category_list = __category_service.find_all_categories()
    db_interest_list = __interest_service.find_all_interests()
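    # Query the NewsWhip publisher API for each competitor domain and enrich
    # every returned article with metadata scraped from the page itself.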
    for domain in domain_list:
        r = requests.get("https://api.newswhip.com/v1/publisher/" + domain + "/1?key="+newswhip_key)
        response = json.loads(r.text)
        logging.info("Domain: " + domain + " & No of Articles: " + str(len(response['articles'])))
        for item in response['articles']:
            try:
                article_info = domparser.element_picker(item['link'].encode('utf-8'))
                if article_info['title'] is not None or article_info['feature_image'] is not None or article_info['url'] is not None:
                    article = {'title': '', 'url': '', 'description': '', 'keywords': '', 'feature_image': '','New_score': '',
                            'max_new_score': '', 'fb_like': '', 'tweet_count': '', 'publisher': '', "uuid": '', 'published': '',
                               'category': [], 'interest': [], 'fetch': '', 'created_keys':[]}
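                    # Prefer fields from the NewsWhip payload; fall back to values
                    # scraped from the article page.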
                    if item['headline'] is None:
                        article['title'] = article_info['title']
                    else:
                        article['title'] = item['headline'].encode('utf-8')

                    if item['link'] is None:
                        article['url'] = article_info['url']
                    else:
                        article['url'] = item['link'].encode('utf-8')

                    if item['excerpt'] is None:
                        article['description'] = article_info['description']
                    else:
                        article['description'] = item['excerpt']

                    if item['keywords'] is None:
                        article['keywords'] = article_info['keywords']
                    else:
                        article['keywords'] = (item['keywords']).split(',')

                    if item['image_link'] is None:
                        article['feature_image'] = article_info['feature_image']
                    else:
                        article['feature_image'] = item['image_link']
                    if 'nw_score' in item:
                        article['New_score'] = item['nw_score']
                    else:
                        article['New_score'] = 0
                    if 'max_nw_score' in item:
                        article['max_new_score'] = item['max_nw_score']
                    else:
                        article['max_new_score'] = 0
                    if 'total_engagement_count' in item['fb_data']:
                        article['fb_like'] = item['fb_data']['total_engagement_count']
                    else:
                        article['fb_like'] = 0
                    if 'tw_count' in item['tw_data']:
                        article['tweet_count'] = item['tw_data']['tw_count']
                    else:
                        article['tweet_count'] = 0
                    if 'publisher' in item['source']:
                        article['publisher'] = item['source']['publisher']
                    else:
                        article['publisher'] = "None"
                    if 'uuid' in item:
                        article['uuid'] = item['uuid']
                    else:
                        article['uuid'] = 'None'
                    if 'publication_timestamp' in item:
                        article['published'] = time.strftime('%Y-%m-%d %H:%M', time.localtime(item['publication_timestamp']/1000.0))
                    else:
                        article['published'] = "None"
                    article['fetch'] = current_epoch_time(datetime.now())

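                    # Normalise the scraped categories: split comma-separated entries,
                    # lowercase them and drop duplicates.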
                    dummy_category = []
                    for i in article_info['category']:
                        split_list = i.split(',')
                        for itr in split_list:
                            if itr.lower() not in dummy_category:
                                dummy_category.append(itr.lower())

                    article_info['category'] = dummy_category
                    if not any(category['category'] in article_info['category'] for category in db_category_list):
                        for category_item in article_info['category']:
                            for interest in db_interest_list:
                                if category_item == interest['interest']:
                                    if category_item not in article['interest']:
                                        article['interest'].append(category_item)

                        if len(article['interest']) <= 0:
                            article['category'] = article_info['category']
                        else:
                            article['category'] = []
                            for int_item in article['interest']:
                                # Look up the interest record, then map it back to its parent category.
                                current_interest = [member for member in db_interest_list if int_item == member['interest']]
                                if len(current_interest) == 1:
                                    current_category = [member for member in db_category_list if current_interest[0]['category_id'] == member['_id']]
                                    if len(current_category) == 1:
                                        article['category'].append(current_category[0]['category'])

                    else:
                        if article['keywords'] is not None:
                            (article['interest'], return_category_ids) = checking_interest(article['keywords'])
                        article['category'] = article_info['category']

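                    # Build candidate keyphrases from the scraped keywords, title and
                    # description; stop words and publisher boilerplate are filtered out below.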
                    key_phrases_list = []
                    raw_key_phrases_list = []
                    interest_category_id = []
                    if article_info['keywords']:
                        keywords_key_phrases = article_info['keywords'][0].decode('ascii', 'ignore').lower().split(",")
                        key_phrases_list += keywords_key_phrases
                        raw_key_phrases_list += keywords_key_phrases
                    if article_info['title']:
                        title_key_phrases = extractKeyphrases(article_info['title'].decode('ascii', 'ignore'))
                        key_phrases_list += list(title_key_phrases)
                        raw_key_phrases_list.append(str(article_info['title'].decode('ascii', 'ignore')))
                    if article_info['description']:
                        description_key_phrases = extractKeyphrases(article_info['description'].decode('ascii', 'ignore'))
                        key_phrases_list += list(description_key_phrases)
                        raw_key_phrases_list.append(str(article_info['description'].decode('ascii', 'ignore')))
                    d = Counter(key_phrases_list)
                    keys_to_remove = ['', ' ', '%', 'an', 'a', ',', 'ii', 'r', 'so', 'is', 'in', 'the', 'nbt', 'us', 'them', 's', '|', 'eisamay', 'navbharat', '-navbharat', 'navbharat times', 'samay', 'india']
                    refactor_key_list = []
                    for key in list(d.keys()):
                        if (key.strip()).lower() not in keys_to_remove and (key.strip()).lower() not in refactor_key_list:
                            refactor_key_list.append((key.strip()).lower())
                    article['created_keys'] = refactor_key_list
                    if article['created_keys'] is not None:
                        (created_interest, interest_category_id) = checking_interest(raw_key_phrases_list)
                        if created_interest is not None:
                            article['interest'] += created_interest
                        if interest_category_id is not None:
                            cat_dict = Counter(interest_category_id)
                            top_order_category = ''
                            if cat_dict:
                                # The most frequently matched category id wins.
                                top_order_category, top = cat_dict.most_common(1)[0]
                            if top_order_category:
                                supposed_category = __category_service.find_category(top_order_category)
                                article['category'].append(supposed_category['category'])

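                    # A story is marked actionable only when at least one interest matched.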
                    if article['interest']:
                        article['status'] = True
                    else:
                        article['status'] = False
                    __story_service.save_story(article)
                    __fetch_service.save_fetch(fetch)

            except Exception as ex:
                logging.info("Runtime Error: " + str(ex))
        # Group previously saved stories whose combined keyphrase cosine similarity
        # and category-overlap score with the reference story p_article reaches 0.5.
        articles = __story_service.find_latest_stories()
        for article in articles:
            if article["_id"] != p_article["_id"]:
                article_cosine_smlr_score = get_cosine_similarity(p_article["created_keys"], article["created_keys"])
                similar_category_score = get_smlr_category_score(p_article["category"], article["category"])
                article_cosine_smlr_score = article_cosine_smlr_score + similar_category_score

                if article_cosine_smlr_score >= 0.5:
                    article["group"] = "group" + str(gp_count)
                    __story_service.update_story(article)


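# Neither similarity helper above is defined in this snippet. Minimal sketches follow,
# with assumed behaviour (bag-of-words cosine for keyphrases, Jaccard overlap for
# categories); only the call sites above come from the original code, and Counter is
# assumed to be imported from collections as it is used elsewhere in this module.
def get_cosine_similarity(keys_a, keys_b):
    # Cosine similarity of two keyphrase lists treated as bags of words.
    a, b = Counter(keys_a), Counter(keys_b)
    numerator = sum(a[k] * b[k] for k in set(a) & set(b))
    denominator = (sum(v * v for v in a.values()) ** 0.5) * (sum(v * v for v in b.values()) ** 0.5)
    return numerator / denominator if denominator else 0.0


def get_smlr_category_score(cats_a, cats_b):
    # Share of categories the two stories have in common (Jaccard index), assumed scoring.
    union = set(cats_a) | set(cats_b)
    return len(set(cats_a) & set(cats_b)) / float(len(union)) if union else 0.0

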
if __name__ == "__main__":
    while True:
        logging.info("Scheduler initialize....")
        logging.info("Start time: " + str(datetime.now()))
        __story_service = StoryService()
        __category_service = CategoryService()
        __interest_service = InterestService()
        __fetch_service = FetchService()
        with open('credentials.json', 'r') as credential_file:
            data = json.load(credential_file)
            newswhip_key = data["key"]
            competitors = data["competitors"]
            watching_stories(competitors)
        stories = __story_service.find_latest_stories()
        if len(stories) > 100:
            logging.info("Grouping initialize....")
            __story_service.reset_group()
            gp_count = 0
                "query":
                (trend["query"]).replace('%23',
                                         '#').replace('%22',
                                                      '"').encode('utf-8'),
                "location":
                locations[idx],
                "name":
                trend["name"].encode('utf-8')
            }
            __twitter_service.save_twitter(new_dict)
            get_top_tweets(new_dict['query'], new_dict['location'])


if __name__ == "__main__":
    while True:
        logging.info("Twitter initialize....")
        __twitter_service = TwitterService()
        __tweet_service = TweetService()
        with open('credentials.json', 'r') as credential_file:
            json_obj = json.load(credential_file)
            locations = json_obj['twitter']['location']
            locationIDs = json_obj['twitter']['locationID']

            consumer_key = json_obj['twitter']['app'][0]['TW_CONSUMER_KEY']
            consumer_secret = json_obj['twitter']['app'][0][
                'TW_CONSUMER_SECRET']
            access_token = json_obj['twitter']['app'][0]['TW_ACCESS_TOKEN']
            access_token_secret = json_obj['twitter']['app'][0][
                'TW_ACC_TOKEN_SECRET']
            auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
            auth.set_access_token(access_token, access_token_secret)
    return RestResponse(all_group).to_json()


@route('/group/<group_id>')
def get_trending_group(group_id):
    group_stories = __story_service.get_group_stories(group_id)
    return RestResponse(group_stories).to_json()


@route('/twitter')
def location_trending():
    trending_hashtags = __twitter_service.get_location_trending()
    return RestResponse(trending_hashtags).to_json()


@route('/tweets')
def get_tweets():
    top_tweets = __tweet_service.get_tweets()
    return RestResponse(top_tweets).to_json()

if __name__ == "__main__":
    __user_service = UserService()
    __interest_service = InterestService()
    __category_service = CategoryService()
    __story_service = StoryService()
    __twitter_service = TwitterService()
    __tweet_service = TweetService()

    run(host='0.0.0.0', port=8889, server='waitress')
    logging.info("server running....")
from app.modules import *
from app.config import logging

logging.info("\nProgram started from here.")
driver = connecting_with_whatsapp()

# Fetching data from Excel/File:
to_forward, pause_time = retrieve_file_parameter()
pause_time = int(pause_time)
employee = r"EPL"
client = r"CLT"

ss_path = os.path.join(os.getcwd(), "app", "screenshots")
ss_path = ss_path + os.sep
# Data Fetched...

while True:
    logging.info("\nWhile Loop started here.\n")
    # Deleting already existing screenshots.
    delete_ss(ss_path)

    check_unread_from_these(driver, ss_path, employee, client)

    send_ss(driver, ss_path, to_forward)
    print("Successfully send the screenshots!!!...")

    sleep(randint(7, 10))

    logging.info("\nWhile Loop ended here.\n")
    logging.info("\n\nWaiting for " + str(pause_time) + " minutes to re-run the bot.\n\n")
def clean_csv_data_and_images():

    # Read in all csvs from the RAW_CSV_DIRECTORY and concat them into one dataframe
    files = glob.glob("{}/*.csv".format(GPATH / RAW_CSV_DIRECTORY))
    logging.info("Globbing {} csvs for cleaning.".format(len(files)))
    df = pd.concat((pd.read_csv(f) for f in files))
    logging.info("The df has shape {}.".format(df.shape))

    df.set_index('id', inplace=True)

    # Drop duplicates (based on id)
    count_before = len(df.index)
    df = df[~df.index.duplicated(keep='first')]
    count_after = len(df.index)
    logging.info(
        f'Dropped {count_before-count_after} items with duplicate indices.')

    # Drop na
    df.dropna(inplace=True)

    # Keyword preprocessing
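    # Spaces become underscores so multi-word keywords stay single tokens; commas become the token separator.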
    k = df['keywords']
    k = k.str.replace(' ', '_')
    k = k.str.replace(',', ' ')
    df.keywords = k

    # Get the filenames of the images cleanly without any paths
    filenames_list = []
    # The images column holds a stringified list of dicts; parse it safely.
    from ast import literal_eval
    for i, row in df.iterrows():
        try:
            filename = literal_eval(row.images)[0]['path'].split('/')[1]
        except Exception:
            filename = ""
        filenames_list.append(filename)
    df['filename'] = filenames_list
    # Drop the images column
    df.drop(['images', 'searchTerm', 'pageNum', 'contributer', 'type'],
            axis=1,
            inplace=True)

    # Ensure the files really are there
    badIds = []
    for i, row in df.iterrows():
        fn = GPATH / RAW_IMAGE_DIRECTORY / 'full' / row.filename
        if not path.exists(fn):
            badIds.append(i)
    if len(badIds) > 0:
        logging.warning(
            f'Found {len(badIds)} ids with no image file. Dropping them from the cleaned df.'
        )
        df.drop(badIds, inplace=True)
    else:
        logging.info('All files checked and exist while cleaning the data.')

    # Sort by index
    df.sort_index(inplace=True)
    df.info()

    # Clean the images
    fix_all_with_threads(df)

    # Drop a few more columns
    df.drop(['image_urls', 'filename'], axis=1, inplace=True)

    # Save out a compiled and cleaned csv
    now = datetime.now()
    timeString = now.strftime("%Y%m%d%H%M%S%f")
    csv_file_name = timeString + ".csv"

    full_path = GPATH / READY_CSV_DIRECTORY / csv_file_name
    df.to_csv(full_path)
    logging.info("The final df has shape {}.".format(df.shape))
    logging.info(f"Saving cleaned data to {full_path}.")
