Example #1
def main(self_test=False,
         market_app_ids=None,
         max_inventory_size=None,
         profile_id=None):
    if market_app_ids is None:
        # App: "Puzzle Box"
        # Reference: https://www.steamcardexchange.net/index.php?gamepage-appid-448720
        # market_app_ids = [448720]  # superseded below by the full hard-coded list
        market_app_ids = [
            int(app_id) for app_id in get_hard_coded_market_dict()
        ]

    if self_test:
        for market_app_id in market_app_ids:
            market_app_has_been_found = check_whether_items_for_given_app_exist_in_inventory_of_given_user(
                market_app_id=market_app_id,
                profile_id=profile_id,
                max_inventory_size=max_inventory_size)
    else:
        results = check_all_asf_bots(market_app_ids,
                                     max_inventory_size=max_inventory_size)
        display_results_with_markdown(results)
        save_to_disk(results)

    return True
Example #2
def index(input_file, output_file_dictionary, output_file_postings):
    df = pandas.read_csv(input_file)
    dictionary = defaultdict(lambda: defaultdict(list))
    doc_vector = {}
    doc_length_dictionary = {}
    # num = 1
    # total_entries_len = len(list(df.itertuples(index=False)))

    for row in df.itertuples(index=False):
        # print(f'currently indexing number {num}, {num / total_entries_len * 100}% done')
        # num += 1
        content = getattr(row, "content")
        document_id = getattr(row, "document_id")

        words = process_content(content)
        ctr = dict(Counter(words))
        doc_vector[document_id] = ctr  # Store the document vector into a dictionary to be saved later

        positional_indexes_in_doc = defaultdict(list)
        for position, word in enumerate(words):
            positional_indexes_in_doc[word].append(position)

        for word, indexes in positional_indexes_in_doc.items():
            dictionary[word][document_id] = indexes

        # Compute the document length from the log-weighted term frequencies (reusing ctr)
        log_tf_dictionary = {
            word: 1 + math.log(tf, 10)
            for word, tf in ctr.items()
        }
        length_of_log_tf_vector = math.sqrt(
            sum([dim * dim for dim in log_tf_dictionary.values()]))
        doc_length_dictionary[document_id] = length_of_log_tf_vector

    save_to_disk(doc_length_dictionary, "doc_length_dictionary.txt")
    save_to_disk(doc_vector, "doc_vector.txt")

    # for key, value in dictionary.items():
    #     dictionary[key] = sorted(value.items(), key=lambda x: x[0])

    # Generates a file of human-readable postings and occurrences. Mainly used for debugging.
    # Each line is of the format: `word`: num_of_occurences -> `[2, 10, 34, ...]` (postings list)
    # generate_occurences_file(dictionary)  # uncomment if needed for debugging

    # Saves the postings file and dictionary file to disk
    process_dictionary(dictionary, output_file_dictionary,
                       output_file_postings)
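Note: the save_to_disk helper called above is not included in the snippet. A minimal sketch of what it could look like, assuming a plain pickle-based serialization; the (data, output_file) argument order is taken from the calls above, everything else is a guess:

import pickle


def save_to_disk(data, output_file):
    # Persist the in-memory structure (e.g. doc_vector or doc_length_dictionary)
    # so a later search step can load it back with pickle.load().
    with open(output_file, "wb") as handle:
        pickle.dump(data, handle)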
Example #3
    def _download_video(self, vid_url: str, path_to_save=None) -> None:
        try:
            vid_resp = requests.get(vid_url,
                                    headers=utils.request_headers(),
                                    stream=True)
            vid_resp.raise_for_status()
        except requests.RequestException:
            print("::-> An error occurred while requesting the file")
            raise

        # save the video file
        utils.save_to_disk(vid_resp,
                           self.get_video_title(),
                           path_to_save,
                           is_video=True)
        print("Done!\n")
Example #4
    def download_audio(self, path_to_save=None) -> None:
        """
        Downloads only the audio from the video. 
        Format: .mp3

        (Useful when downloading songs from YouTube)
        """
        # check if the soup and json dict exist
        if not self._src_page_soup:
            self._create_soup()
            self._create_json_dict()
            self._video_streams, self._audio_streams = self._extract_streams()

        audio_src_url: str = ""
        for audio_stream in self._audio_streams:
            # apparently YT serves medium quality audio as its highest quality
            if audio_stream["audio_quality"] == "AUDIO_QUALITY_MEDIUM":
                audio_src_url = audio_stream["src_url"]
                break

        # clean the url first
        audio_src_url = utils.sanitize_url(audio_src_url)

        print("::-> Downloading the audio file...")
        # request the audio source
        try:
            audio_resp: requests.Response = requests.get(
                audio_src_url, headers=utils.request_headers(), stream=True)
            audio_resp.raise_for_status()
        except requests.RequestException:
            print("::-> An error occurred while requesting the file")
            raise

        # save to disk with is_video not set
        utils.save_to_disk(audio_resp,
                           self.get_video_title(),
                           path_to_save,
                           is_video=False)
        print("Done!\n")
Example #5
def process_dictionary(dictionary, output_file_dictionary,
                       output_file_postings):
    dictionary_to_be_saved = save_to_postings_and_generate_dictionary(
        dictionary, output_file_postings)
    save_to_disk(dictionary_to_be_saved, output_file_dictionary)
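For context, Example #2's index() builds the in-memory positional index and hands it to process_dictionary() above, which writes the postings and dictionary files. A possible invocation, using placeholder file names rather than the project's real command-line arguments:

index(input_file="dataset.csv",
      output_file_dictionary="dictionary.txt",
      output_file_postings="postings.txt")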
Example #6
            continue
    tweets = pd.DataFrame(tweets_data)
    return tweets


def result(total_pos, total_neu, total_neg):
    each_live_result = dict(num_pos=total_pos,
                            num_neu=total_neu,
                            num_neg=total_neg)
    return each_live_result


if __name__ == '__main__':
    tweets_data_path = 'tweets_stream/'
    out = {}
    files = get_all_files(Config.data_path + tweets_data_path,
                          extension='json')
    for file in files:
        tweets = load_live_tweets(file)
        cleaned_tweets = preprocess(tweets)

        total_pos, total_neu, total_neg = vader_sentiment(cleaned_tweets)
        time_stamp = cleaned_tweets.iat[1, 0]  # get the time stamp of each live stream file
        each_live_result = result(total_pos, total_neu, total_neg)
        out.update({time_stamp: each_live_result})

    save_to_disk(data=out,
                 path=Config.reports_path,
                 filename='live_sentiment_summary.json')
Example #7
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        if current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                current_chunk = []
            else:
                continue
    return continuous_chunk


if __name__ == "__main__":
    tweets_data_path = 'tweets_by_country/'
    all_files = get_all_files(Config.base_path + tweets_data_path, extension='csv')
    out = {}
    for country in Config.country_prefix:
        df = pd.DataFrame()
        news_files = list(filter(lambda x: country in x, all_files))
        for file in news_files:
            data = pd.read_csv(file, names=Config.colnames, usecols=Config.usecols_list)
            data.dropna(axis=0, how='any', inplace=True)
            df = df.append(data, ignore_index=True)
        text_df = filter_df(Config.keywords, df)
        text_list = df_to_str(text_df)
        text_str = '. '.join(text_list)
        namedEntities = get_continuous_chunks(text_str)
        out[country.replace('_', '')] = namedEntities
    save_to_disk(data=out,
                 path=Config.base_path + 'report_named_entities/',
                 filename='entities_by_country.json')
Example #8
if __name__ == "__main__":
    tweets_data_path = 'tweets_by_country/'  # _translation
    out = {}
    all_files = get_all_files(Config.data_path + tweets_data_path,
                              extension='csv')
    for country in Config.country_prefix:  # ['de_', 'fr_', 'nl_']:
        df = pd.DataFrame()
        news_files = list(filter(lambda x: country in x, all_files))
        for file in news_files:
            data = pd.read_csv(file,
                               names=Config.colnames,
                               usecols=Config.usecols_list)
            data.dropna(axis=0, how='any', inplace=True)
            df = df.append(data, ignore_index=True)

        text_df = filter_df(Config.keywords, df)
        # translated_df = translation(text_df)
        cleaned_df = preprocess(df=text_df)  # df= translated_df

        extreme_pos_count, total_pos, total_neu, total_neg, extreme_neg_count = extreme_vader_sentiment(
            text_df)

        out[country.replace('_', '')] = extreme_result(
            extreme_pos_count, total_pos, total_neu, total_neg, extreme_neg_count)

    save_to_disk(data=out,
                 path=Config.reports_path,
                 filename='all_extreme_sentiment_summary_country.json')
Example #9
                'country': c_prefix
            }, line)))
        entities.extend(raw)
    return pd.DataFrame(entities)


if __name__ == "__main__":
    tweets_data_path = '../Data/tweets_by_country/'
    out = pd.DataFrame()
    out_count = {}
    all_files = get_all_files(Config.data_path + 'tweets_by_country/',
                              extension='csv')
    for country in Config.country_prefix:
        df = pd.DataFrame()
        news_files = list(filter(lambda x: country in x, all_files))
        for file in news_files:
            data = pd.read_csv(file,
                               names=Config.colnames,
                               usecols=Config.usecols_list)
            data.dropna(axis=0, how='any', inplace=True)
            df = df.append(data, ignore_index=True)
        text_df = filter_df(Config.keywords, df)
        cleaned_df = preprocess_less(text_df)
        extracted = tag(text_df, country)
        out_count.update(
            {country: extracted.entity.value_counts().head(20).to_dict()})
        out = out.append(extracted)
    save_to_disk(out_count, Config.reports_path, 'spacy_entity_country.json')
    save_to_disk({'all': out.entity.value_counts().head(20).to_dict()},
                 Config.reports_path, 'spacy_entity_all.json')
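Note: the tag() helper is not shown in this snippet. Judging by extracted.entity.value_counts() above, it returns a DataFrame with an 'entity' column; a spaCy-based version might look roughly like the sketch below. The model name, the 'text' column and the 'label' field are assumptions, not taken from the original project:

import pandas as pd
import spacy

nlp = spacy.load("en_core_web_sm")


def tag(text_df, country):
    # Run spaCy NER over each row of text and collect one record per entity found.
    entities = []
    for text in text_df["text"]:
        for ent in nlp(str(text)).ents:
            entities.append({
                "entity": ent.text,
                "label": ent.label_,
                "country": country,
            })
    return pd.DataFrame(entities)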
Example #10
    def download(self, vid_format: str, path_to_save=None) -> None:
        """
        Downloads the video.
        Current resolutions supported: all
        """
        if not vid_format:
            print("\n::-> Error: quality/resolution must not be None\n")
            exit(1)

        # check if soup and json dict are created
        if not self._src_page_soup:
            self._create_soup()
            self._create_json_dict()
            self._video_streams, self._audio_streams = self._extract_streams()

        vid_src_url = None
        vid_wa_url = None  # video without audio url
        for stream in self._video_streams:
            if stream["quality_label"] == vid_format:
                if re.search(",", stream["mime_type"]):
                    vid_src_url = stream["src_url"]
                else:
                    vid_wa_url = stream["src_url"]
                break

        if vid_src_url:
            # got the source url
            vid_src_url = utils.sanitize_url(vid_src_url)

            print("::-> Download in progress...")
            # ? get the response from the src url in chunks (stream=True)
            try:
                response: requests.Response = requests.get(
                    vid_src_url, headers=utils.request_headers(), stream=True)
                response.raise_for_status()
            except requests.RequestException:
                print("::-> An error occurred while requesting the file.")
                raise

            utils.save_to_disk(response,
                               self.get_video_title(),
                               path_to_save,
                               is_video=True)

            # endif

        # ? When the video and audio urls are different
        elif vid_wa_url:
            # clean the url
            vid_wa_url = utils.sanitize_url(vid_wa_url)

            # download audio and video files to be combined
            self.download_audio(path_to_save)
            print("::-> Downloading the video file...")
            self._download_video(vid_wa_url, path_to_save)

            # figure out which video and audio files need to be combined
            if not path_to_save.endswith("/"):
                path_to_save += "/"

            vid_filelist: list = glob.glob(path_to_save + "*.mp4")
            last_vid_file: str = max(vid_filelist, key=os.path.getctime)
            audio_filelist: list = glob.glob(path_to_save + "*.mp3")
            last_audio_file: str = max(audio_filelist, key=os.path.getctime)

            # use ffmpeg to combine both, audio and video
            print(
                "::-> Combining the audio and video files into one video file..."
            )

            # keep the console clean
            cmd: str = f'ffmpeg -v quiet -i "{last_vid_file}" -i "{last_audio_file}" -map 0:v:0 -map 1:a:0 "{self.get_video_title()}_final.mp4"'
            # finally execute the command
            ffmpeg_exitcode = os.system(cmd)

            # delete the downloaded intermediate files so that only the final combined file remains
            try:
                os.remove(last_vid_file)
                os.remove(last_audio_file)
            except OSError:
                pass

        # endif
        print("Successfully downloaded the video/audio titled: ")
        print(self.get_video_title())
        print("\nDownload is complete.\n")