def restore_database(self):
        """Restore the MongoDB database from the backup directory.

        Runs the external ``mongorestore`` tool (path taken from the
        MONGORESTORE_PATH environment variable in ``__init__``) against
        the configured backup directory, creating it first if needed.
        """
        backup_dir = os.path.abspath(Args.backup_db_dir())
        # exist_ok avoids the race between the existence check and creation.
        os.makedirs(backup_dir, exist_ok=True)

        # Pass the command as an argv list: no hand-rolled shell quoting for
        # paths containing spaces and no shell-injection surface.
        subprocess.check_output([self.__mongorestore_path, backup_dir])
    def backup_database(self):
        """Dump the ``videos`` collection of the ``videos_analysis`` DB.

        Runs the external ``mongodump`` tool (path taken from the
        MONGODUMP_PATH environment variable in ``__init__``), writing the
        dump under the configured backup directory (created if missing).
        """
        backup_dir = os.path.abspath(Args.backup_db_dir())
        # exist_ok avoids the race between the existence check and creation.
        os.makedirs(backup_dir, exist_ok=True)

        # argv-list form: no hand-rolled quoting, no shell-injection surface.
        subprocess.check_output([
            self.__mongodump_path,
            '--collection', 'videos',
            '--db', 'videos_analysis',
            '--out', backup_dir,
        ])
Beispiel #3
0
def get_args():
    """Build the standard + agent argument parsers and parse them.

    Returns a tuple ``(all_args, agent_args)`` where ``all_args`` comes
    from a strict parse and ``agent_args`` from ``parse_known_args``.
    """
    parser_bundle = Args()
    add_standard_args(parser_bundle)
    add_agent_args(parser_bundle)
    return (parser_bundle.all.parse_args(),
            parser_bundle.agent.parse_known_args())
Beispiel #4
0
    def __detailed_analysis_for_all_countries(self):
        """Run general + detailed analysis over videos from all tracked countries."""
        os.system('cls')
        print('Please, wait...')

        frame = pd.DataFrame(
            self.__db.get_videos_by_country_codes(list(self.__country_codes)))

        if frame.size > 0:
            # One report tree per day: .../all_country/<dd.mm.yy>/
            report_dir = os.path.join(
                Args.analysis_res_dir(),
                f'all_country{os.sep}{time.strftime("%d.%m.%y")}{os.sep}')

            print('>>> General analysis is carried out')
            self.__general_analysis_for_data(frame, report_dir)
            print('>>> General report is completed!')

            print('>>> Detailed analysis is carried out')
            self.__detailed_analysis_for_data(frame, report_dir)
            print('>>> Detailed analysis is completed!')

            # subprocess.Popen(f'explorer /select, {report_dir}')
            os.startfile(report_dir)
        else:
            print('No data for analysis!')

        del frame
Beispiel #5
0
    def launch(self,
               hours=23,
               minutes=30,
               country_codes_path=Args.country_codes_path()):
        """Run the scraping loop forever.

        Each iteration waits until the next daily scrap time, downloads
        trending-video data for every country code listed in the file,
        saves it to the database and exports it to CSV.

        :param hours: hour (0-23) of the daily scrap time.
        :param minutes: minute (0-59) of the daily scrap time.
        :param country_codes_path: file with the country codes to scrape;
            the default is resolved once, at import time.
        """

        # First target: today at hours:minutes (may already be in the past).
        scrap_time = datetime.combine(date.today(), time(hours, minutes))

        while True:
            current_time = datetime.today()
            delta_time = (scrap_time - current_time)
            # Advance the target by one whole day per day of difference.
            # NOTE(review): timedelta is normalized, so for a target earlier
            # today .days is -1 while .seconds is the non-negative remainder;
            # the sleep below then waits less than a full day — confirm that
            # this matches the intended schedule.
            scrap_time = scrap_time.fromtimestamp(scrap_time.timestamp() +
                                                  abs(delta_time.days) * 86400)

            print(
                f'>>> Next scrap will be {scrap_time.strftime("%Y.%m.%d-%H:%M:%S")}'
            )

            # `t` is presumably the `time` module aliased to avoid clashing
            # with datetime.time used above — verify at the import site.
            t.sleep(delta_time.seconds)

            new_data = match_category_id_with_category_title(
                self.scraper.get_videos_data_by_country_codes_from_file(
                    country_codes_path))

            print(f'>>> New {len(new_data)} data videos received!')

            count = self.db.save_many_videos(new_data)

            print(f'>>> Saved {count} data videos to database!')

            save_videos_data_into_csv(new_data)
Beispiel #6
0
def __create_and_save_word_cloud(data,
                                 filename,
                                 output_dir=Args.analysis_res_dir(),
                                 user_stopwords=STOPWORDS,
                                 bg_color='black',
                                 max_words=100,
                                 max_font_size=120):
    """Render *data* (a single text blob) as a word cloud image.

    The image is saved to ``output_dir/filename``; the directory is
    created when missing.
    """
    word_cloud = WordCloud(stopwords=user_stopwords,
                           background_color=bg_color,
                           max_words=max_words,
                           max_font_size=max_font_size,
                           width=1600,
                           height=800).generate(data)

    plt.figure(figsize=(20, 20))
    plt.imshow(word_cloud)
    plt.axis('off')

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    plt.savefig(os.path.join(output_dir, filename),
                facecolor='k',
                bbox_inches='tight')
    plt.close()
Beispiel #7
0
def views_likes_dislikes_comments_normal_distribution(
    data: DataFrame, output_dir=Args.analysis_res_dir()):
    """Plot log-scale distributions of views, likes, dislikes and comments.

    Adds the four ``*_log`` columns to *data* in place, draws one distplot
    per metric and saves the figure as 'normal_distribution.png'.
    """
    # Log-transform (with +1 to keep zeros finite); columns added in place.
    for log_column, source_column in (('likes_log', 'likes'),
                                      ('view_count_log', 'view_count'),
                                      ('dislikes_log', 'dislikes'),
                                      ('comment_log', 'comment_count')):
        data[log_column] = np.log(data[source_column] + 1)

    plt.figure(figsize=(12, 6))

    # (subplot position, column, title, color) — None keeps seaborn's default.
    panels = (
        (221, 'view_count_log', "VIEWS LOG DISTRIBUTION", None),
        (224, 'likes_log', 'LIKES LOG DISTRIBUTION', 'green'),
        (223, 'dislikes_log', "DISLIKES LOG DISTRIBUTION", 'r'),
        (222, 'comment_log', "COMMENTS LOG DISTRIBUTION", None),
    )
    for position, column, title, color in panels:
        plt.subplot(position)
        axis = sns.distplot(data[column], color=color)
        axis.set_title(title, fontsize=16)

    plt.subplots_adjust(wspace=0.2, hspace=0.4, top=0.9)

    __save_figure(plt, output_dir, 'normal_distribution.png')
    plt.close()
Beispiel #8
0
def category_rating(data: DataFrame, output_dir=Args.analysis_res_dir()):
    """Bar chart counting trending videos per category (most frequent first)."""
    plt.figure(figsize=(30, 9))
    category_order = data['category'].value_counts().index
    axis = sns.countplot(data['category'], order=category_order)
    axis.set_title("Counting the Video Category's ", fontsize=20)
    axis.set_xlabel("", fontsize=20)
    axis.set_ylabel("Count", fontsize=20)

    __save_figure(axis.get_figure(), output_dir, 'category_rating.png')
    plt.close()
    def __init__(self, uri=Args.db_host()):
        """Connect to MongoDB and prepare the ``videos`` collection.

        :param uri: MongoDB connection string; the default is resolved
            once, at import time.
        """
        self.__client = MongoClient(uri)

        self.__db = self.__client["videos_analysis"]

        self.__videos_coll = self.__db["videos"]
        # NOTE(review): the index key is "county_code" — looks like a typo
        # for "country_code"; confirm against the field name the scraper
        # actually stores before changing it.
        self.__videos_coll.create_index([("county_code", pymongo.DESCENDING)])

        # Paths to the external backup/restore tools; None if env vars unset.
        self.__mongodump_path = os.getenv('MONGODUMP_PATH')
        self.__mongorestore_path = os.getenv('MONGORESTORE_PATH')
Beispiel #10
0
def distribution_of_days_histogram(data: DataFrame,
                                   output_dir=Args.analysis_res_dir()):
    """Histogram of the 'interval' column (days to reach trending)."""
    prepared = distribution_of_days_preprocessing(data)

    plt.figure(figsize=(20, 9))

    interval_plot = sns.countplot(prepared['interval'])
    plt.title('Distribution of interval')

    __save_figure(interval_plot.get_figure(), output_dir,
                  'distribution_of_days_histogram.png')
    plt.close()
    def __add_country_codes_from_file(self):
        """Prompt for a file path and load country codes from that file."""
        os.system('cls')
        file_path = input(
            f'Enter file path (Default="{Args.country_codes_path()}"): ')

        # An empty answer means "use the default file".
        file_path = file_path or Args.country_codes_path()

        try:
            self.__add_country_codes(get_data_from_file(file_path))
        except FileNotFoundError as e:
            print(f'{e.strerror}: "{e.filename}"')
Beispiel #12
0
def word_cloud_for_titles(data: DataFrame, output_dir=Args.analysis_res_dir()):
    """Build a word cloud from all video titles (lower-cased, space-joined)."""
    joined_titles = data['title'].str.lower().str.cat(sep=' ')

    __create_and_save_word_cloud(data=joined_titles,
                                 filename='word_cloud_for_titles.png',
                                 user_stopwords=USER_STOPWORDS,
                                 output_dir=output_dir)

    del joined_titles
Beispiel #13
0
    def __detailed_analysis_for_each_country_separately(self):
        """Run general + detailed analysis per country, one report tree each."""
        os.system('cls')
        print('Please, wait...')

        analyzed_any = False

        for code in self.__country_codes:

            print(f'COUNTRY: {pycountry.countries.get(alpha_2=code).name}')

            frame = pd.DataFrame(self.__db.get_videos_by_country_code(code))

            if frame.size == 0:
                print(f'No data for analysis {code}!')
                continue

            analyzed_any = True

            # One report tree per country per day: .../<code>/<dd.mm.yy>/
            report_dir = os.path.join(
                Args.analysis_res_dir(),
                f'{code}{os.sep}{time.strftime("%d.%m.%y")}{os.sep}')

            print('>>> General analysis is carried out')
            self.__general_analysis_for_data(frame, report_dir)
            print('>>> General report is completed!')

            print('>>> Detailed analysis is carried out')
            self.__detailed_analysis_for_data(frame, report_dir)
            print('>>> Detailed analysis is completed!')

            del frame

        # Open the results folder only if at least one country had data.
        if analyzed_any:
            os.startfile(Args.analysis_res_dir())
Beispiel #14
0
def distribution_plot(data: DataFrame, output_dir=Args.analysis_res_dir()):
    """Line plots of the per-category mean of views/likes/dislikes/comments."""
    per_category_mean = pd.DataFrame(
        data[['view_count', 'likes', 'dislikes',
              'comment_count']].groupby(data['category']).mean())

    plt.figure(figsize=(32, 20))

    # (subplot index, column, color, linestyle, title) for each panel.
    panels = (
        (1, 'view_count', 'blue', 'solid', 'View_count vs Category'),
        (2, 'likes', 'green', 'dotted', 'Likes vs Category'),
        (3, 'dislikes', 'black', 'dashed', 'Dislikes vs Category'),
        (4, 'comment_count', 'red', 'dashdot', 'Comment_count vs Category'),
    )
    for index, column, color, style, title in panels:
        plt.subplot(2, 2, index)
        plt.plot(per_category_mean.index,
                 column,
                 data=per_category_mean,
                 color=color,
                 linewidth=2,
                 linestyle=style)
        plt.title(title)
        plt.xticks(rotation=30)

    __save_figure(plt, output_dir, 'distribution_plot.png')
    plt.close()
Beispiel #15
0
def distribution_of_average_time(data: DataFrame,
                                 output_dir=Args.analysis_res_dir()):
    """Plot the per-category mean of the days-to-trend interval."""
    prepared = distribution_of_days_preprocessing(data)

    averages = pd.DataFrame(prepared['interval'].groupby(
        prepared['category']).mean()).sort_values(by="interval")

    plt.figure(figsize=(20, 9))
    plt.plot(averages, color='skyblue', linewidth=2)
    plt.title("Average Days to be trending video", fontsize=20)
    plt.xlabel('Category', fontsize=16)
    plt.ylabel('Average Time Interval', fontsize=16)
    plt.xticks(rotation=30)

    __save_figure(plt, output_dir, 'distribution_of_average_time.png')
    plt.close()
def save_videos_data_into_csv(
        videos_data: list,
        file_name=None,
        output_dir=Args.raw_data_dir()):
    """Write the scraped videos to a CSV file.

    :param videos_data: list of video dicts; the header row is taken from
        the first video's keys, and None entries are skipped.
    :param file_name: target file name. Defaults to a fresh
        "<dd-mm-yy_HH.MM.SS>_videos.csv" generated at call time — the old
        default was evaluated once at import, so every call in one process
        reused (and overwrote) the same file.
    :param output_dir: directory the file is written into.
    :raises ValueError: if videos_data is None or empty.
    """
    if videos_data is None:
        raise ValueError('Videos data can`t be None!')
    if not videos_data:
        # Previously this crashed with IndexError on videos_data[0].
        raise ValueError('Videos data can`t be empty!')

    if file_name is None:
        file_name = f"{time.strftime('%d-%m-%y_%H.%M.%S')}_videos.csv"

    # Header row from the first video's keys.
    csv_data = [','.join(videos_data[0].keys())]

    csv_data.extend(
        ','.join(prepare_feature_for_csv(value) for value in video.values())
        for video in videos_data if video is not None)

    write_to_file(output_dir, file_name, csv_data)
Beispiel #17
0
def match_category_id_with_category_title(videos_data: list,
                                          category_id_file_path=None) -> list:
    """Replace each video's numeric 'category_id' with its category title.

    Titles come from the YouTube category JSON file; ids with no known
    title keep their original value under the 'category' key. Videos that
    already have a 'category' key are left untouched. Mutates and returns
    *videos_data*.

    :raises ValueError: if videos_data is None.
    """
    if videos_data is None:
        raise ValueError('Videos data can`t be None!')

    if category_id_file_path is None:
        category_id_file_path = Args.category_id_file_path()

    with open(category_id_file_path, 'r') as source:
        id_to_title = {
            int(item['id']): item['snippet']['title']
            for item in json.load(source)['items']
        }

    for video in videos_data:
        if 'category' not in video:
            raw_id = video.pop('category_id')
            video['category'] = id_to_title.get(int(raw_id), raw_id)

    return videos_data
Beispiel #18
0
def distribution_boxplot(data: DataFrame, output_dir=Args.analysis_res_dir()):
    """Boxplots of log-scaled views/likes/dislikes/comments per category."""
    # Log-transform (with +1 to keep zeros finite), indexed by category.
    log_counts = pd.concat([np.log(data['view_count'] + 1),
                            np.log(data['likes'] + 1),
                            np.log(data['dislikes'] + 1),
                            np.log(data['comment_count'] + 1)],
                           axis=1)
    log_counts.index = data['category']
    # Mask exact zeros (log of original value 0 after the +1 shift).
    log_counts = log_counts[(log_counts != 0)]

    category_order = data['category'].value_counts().index

    plt.figure(figsize=(32, 20))
    for position, column in enumerate(
            ('view_count', 'likes', 'dislikes', 'comment_count'), start=1):
        plt.subplot(2, 2, position)
        sns.boxplot(log_counts.index,
                    column,
                    data=log_counts,
                    order=category_order)
        plt.xticks(rotation=30, fontsize=12)

    __save_figure(plt, output_dir, 'distribution_boxplot.png')
    plt.close()
Beispiel #19
0
    def __init__(self):
        """Store the YouTube Data API key from the project configuration."""

        self.__api_key = Args.api_key()
Beispiel #20
0
def add_standard_args(args: Args):
    """Register the standard deepdrive command-line options on *args*.

    Fixes relative to the previous version: the help texts of
    --randomize-shadow-level and --randomize-month were swapped, and two
    multi-line help strings were missing a separating space
    ("performancestats", "andwanted").
    """
    args.add(
        '-e', '--env-id', nargs='?', default='Deepdrive-v0',
        help='Select the environment to run')
    args.add(
        '-r', '--record', action='store_true', default=False,
        help='Records game driving, including recovering from random actions')
    args.add(
        '--discrete-actions', action='store_true', default=False,
        help='Use discrete, rather than continuous actions')
    args.add(
        '--recording-dir', nargs='?', default=c.RECORDING_DIR,
        help='Where to store and read recorded environment data from')
    args.add(
        '--render', action='store_true', default=False,
        help='Show the cameras as seen your agents in Python')
    args.add(
        '--sync', action='store_true', default=False,
        help='Use synchronous stepping mode where the simulation advances only '
             'when calling step')
    args.add(
        '--sim-step-time', type=float,
        default=c.DEFAULT_SIM_STEP_TIME,
        help='Time to pause sim in synchronous stepping mode')
    args.add(
        '--enable-traffic', action='store_true', default=False,
        help='Enable traffic within the simulator')
    args.add(
        '--randomize-sun-speed', action='store_true', default=False,
        help='Whether to randomize the virtual speed of the earth\'s orbit '
             'around the sun')
    args.add(
        '--randomize-view-mode', action='store_true', default=False,
        help='Whether to randomize view mode on episode reset')
    args.add(
        '--randomize-shadow-level', action='store_true', default=False,
        help='Whether to randomize shadow quality render levels')
    args.add(
        '--randomize-month', action='store_true', default=False,
        help='Whether to randomize virtual position of Earth around Sun via '
             'month')
    args.add(
        '--path-follower', action='store_true', default=False,
        help='Whether to let the in-game path follower drive')
    args.add(
        '--eval-only', action='store_true', default=False,
        help='Whether to just run evaluation, i.e. disable gradient updates', )
    args.add(
        '--driving-style', nargs='?',
        default=DrivingStyle.NORMAL.as_string(),
        help='Speed vs comfort prioritization, i.e. ' +
             ', '.join([level.name.lower() for level in
                        DrivingStyle]))
    args.add(
        '--remote', action='store_true', default=False,
        help='Use API to connect to a remote environment')
    args.add(
        '-v', '--verbose',
        help='Increase output verbosity', action='store_true')
    args.add(
        '--camera-rigs', nargs='?', default=None,
        help='Name of camera rigs to use')
    args.add(
        '--experiment', nargs='?', default=None,
        help='Name of your experiment')
    args.add(
        '--fps', type=int, default=c.DEFAULT_FPS,
        help='Frames or steps per second')
    args.add(
        '--ego-mph', type=float, default=25,
        help='Ego (i.e. main) agent vehicle miles per hour')
    args.add(
        '--view-mode-period', type=int, default=None,
        help='Number of steps between view mode switches')
    args.add(
        '--max-steps', type=int, default=None,
        help='Max number of steps to run per episode')
    args.add(
        '--max-episodes', type=int, default=None,
        help='Maximum number of episodes')
    args.add(
        '--server', action='store_true', default=False,
        help='Run as an API server - serializes in pyarrow', )
    args.add(
        '--json-server', action='store_true', default=False,
        help='Run as a JSON API server - serializes with JSON', )
    args.add(
        '--upload-gist', action='store_true', default=False,
        help='Upload a private gist with driving performance '
             'stats csv files', )
    args.add(
        '--public', action='store_true', default=False,
        help='Results will be made public, i.e. artifacts like '
             'https://gist.github.com/deepdrive-results/cce0a164498c17269ce2adea2a88ec95', )

    args.add(
        '--image-resize-dims', nargs='?',
        default=json.dumps(MOBILENET_V2_IMAGE_SHAPE),
        help='Resize the image coming from the cameras. This was added as '
             'we trained MNET (224x224) on old AlexNet data (227x227), and '
             'wanted to test using the same transformation.')
    args.add(
        '--update-sim', action='store_true', default=False,
        help='Update sim to the latest version', )

    args.add(
        '--scenario', type=int, default=c.DEFAULT_SCENARIO_INDEX,
        help='Scenario index to run 0-5 are Kevindale scenarios')

    args.add('--map', nargs='?', default='',
        help='The Unreal Map to load - options: ' +
             ', '.join(c.MAP_LOOKUP.keys()))
0
def correlation(data: DataFrame, output_dir=Args.analysis_res_dir()):
    """Heatmap of pairwise correlations between the engagement metrics."""
    metrics = data[['view_count', 'likes', 'dislikes', 'comment_count']]
    heatmap = sns.heatmap(metrics.corr(), cmap='Blues', annot=True)

    __save_figure(heatmap.get_figure(), output_dir, 'correlation.png')
    plt.close()
Beispiel #22
0
def sentiment_analysis(data: DataFrame, output_dir=Args.analysis_res_dir()):
    """Plot, per category, the summed VADER polarity of the tag vocabulary.

    For each category the tags are tokenized; stop words (en/de/fr/ru),
    tokens of length <= 2 and pure numbers are dropped; the compound
    polarity scores of the most frequent remaining words are summed. The
    result is drawn as a horizontal bar chart and saved as
    'polarity_of_categories.png' in *output_dir*.
    """
    category_list = data['category'].unique()

    # One combined stop-word set for all languages present in the data;
    # set membership is O(1) vs. the old list scans.
    all_stopwords = set()
    for language in ('english', 'german', 'french', 'russian'):
        all_stopwords.update(stopwords.words(language))

    # Hoisted out of the loops: the analyzer was previously constructed
    # once per word, which dominated this function's runtime.
    analyzer = SentimentIntensityAnalyzer()

    polarities = []
    MAX_N = 10000

    for category in category_list:
        print(f'>> {category}')

        tags_word = data[data['category'] == category]['tags'].str.lower(
        ).str.cat(sep=' ')

        # Remove punctuation/numbers, then keep meaningful words only.
        tags_word = re.sub('[^A-Za-z]+', ' ', tags_word)
        cleaned_words = [
            w for w in word_tokenize(tags_word)
            if w not in all_stopwords and len(w) > 2 and not w.isdigit()
        ]

        # Sum compound polarity over the most frequent words.
        word_dist = nltk.FreqDist(cleaned_words)
        frequent = pd.DataFrame(word_dist.most_common(MAX_N),
                                columns=['Word', 'Frequency'])
        compound = sum(
            analyzer.polarity_scores(word)['compound']
            for word in frequent['Word'].head(MAX_N))

        polarities.append(compound)

    tags_sentiment = pd.concat(
        [pd.DataFrame(category_list),
         pd.DataFrame(polarities)], axis=1)
    tags_sentiment.columns = ['category', 'polarity']
    tags_sentiment = tags_sentiment.sort_values('polarity').reset_index()

    plt.figure(figsize=(18, 10))
    sns.barplot(x=tags_sentiment['polarity'],
                y=tags_sentiment['category'],
                data=tags_sentiment)

    # Fixed: labels were swapped — the x axis holds polarity values and
    # the y axis holds category names.
    plt.xlabel("Polarity", fontsize=20)
    plt.ylabel("Categories", fontsize=20)
    plt.yticks(fontsize=15)
    plt.xticks(fontsize=15)
    plt.title("\nPolarity of Different Categories videos\n", fontsize=25)

    __save_figure(plt, output_dir, 'polarity_of_categories.png')
    plt.close()