def main(self_test=False, market_app_ids=None, max_inventory_size=None, profile_id=None):
    if market_app_ids is None:
        # Default to every app in the hard-coded market dictionary.
        # Example entry: "Puzzle Box" (app_id 448720)
        # Reference: https://www.steamcardexchange.net/index.php?gamepage-appid-448720
        market_app_ids = [
            int(app_id) for app_id in get_hard_coded_market_dict()
        ]

    if self_test:
        # Self-test: only check a single user's inventory for each market app.
        for market_app_id in market_app_ids:
            market_app_has_been_found = check_whether_items_for_given_app_exist_in_inventory_of_given_user(
                market_app_id=market_app_id,
                profile_id=profile_id,
                max_inventory_size=max_inventory_size)
    else:
        results = check_all_asf_bots(market_app_ids,
                                     max_inventory_size=max_inventory_size)
        display_results_with_markdown(results)
        save_to_disk(results)

    return True
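# A minimal usage sketch for main() above, not part of the original module.
# The profile id is a made-up placeholder for a 64-bit Steam profile id; in
# self-test mode the function only inspects that one user's inventory instead
# of querying every ASF bot.
if __name__ == "__main__":
    # Full run: query all ASF bots, print a markdown report, save the results.
    main()

    # Self-test run against a single (hypothetical) profile.
    main(self_test=True,
         profile_id="76561198028705366",
         max_inventory_size=5000)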
def index(input_file, output_file_dictionary, output_file_postings):
    df = pandas.read_csv(input_file)
    dictionary = defaultdict(lambda: defaultdict(list))
    doc_vector = {}
    doc_length_dictionary = {}

    for row in df.itertuples(index=False):
        content = getattr(row, "content")
        document_id = getattr(row, "document_id")
        words = process_content(content)

        # Store the document's raw term-frequency vector, to be saved later.
        tf_dictionary = Counter(words)
        doc_vector[document_id] = dict(tf_dictionary)

        # Record the position of every occurrence of each word in the document.
        positional_indexes_in_doc = defaultdict(list)
        for position, word in enumerate(words):
            positional_indexes_in_doc[word].append(position)
        for word, indexes in positional_indexes_in_doc.items():
            dictionary[word][document_id] = indexes

        # Document length = Euclidean norm of the log-tf vector, used later
        # for cosine normalisation.
        log_tf_dictionary = {
            word: 1 + math.log(tf, 10)
            for word, tf in tf_dictionary.items()
        }
        doc_length_dictionary[document_id] = math.sqrt(
            sum(dim * dim for dim in log_tf_dictionary.values()))

    save_to_disk(doc_length_dictionary, "doc_length_dictionary.txt")
    save_to_disk(doc_vector, "doc_vector.txt")

    # Uncomment the next line if needed for debugging: it generates a file of
    # human-readable postings and occurrences, one line per word in the format
    # `word`: num_of_occurrences -> `[2, 10, 34, ...]` (postings list)
    # generate_occurences_file(dictionary)

    # Save the postings file and dictionary file to disk.
    process_dictionary(dictionary, output_file_dictionary, output_file_postings)
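# A worked sketch of the structures index() builds, assuming process_content()
# simply lowercases and splits text (the real tokenizer is defined elsewhere).
# For a document 42 tokenized to ["to", "be", "or", "not", "to", "be"]:
#
#   dictionary["to"][42] -> [0, 4]    (positions of "to" within doc 42)
#   doc_vector[42]       -> {"to": 2, "be": 2, "or": 1, "not": 1}
#
# and the stored document length is the Euclidean norm of the log-tf vector:
import math

log_tfs = [1 + math.log(tf, 10) for tf in (2, 2, 1, 1)]  # "to", "be", "or", "not"
print(math.sqrt(sum(d * d for d in log_tfs)))  # ~2.3206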
def _download_video(self, vid_url: str, path_to_save=None) -> None:
    try:
        vid_resp = requests.get(vid_url,
                                headers=utils.request_headers(),
                                stream=True)
        vid_resp.raise_for_status()
    except requests.RequestException:
        print("::-> An error occurred while requesting the file")
        raise

    # save the video file
    utils.save_to_disk(vid_resp,
                       self.get_video_title(),
                       path_to_save,
                       is_video=True)
    print("Done!\n")
def download_audio(self, path_to_save=None) -> None:
    """
    Downloads only the audio from the video. Format: .mp3
    (Useful when downloading songs from YouTube)
    """
    # Build the soup and JSON dict on first use.
    if not self._src_page_soup:
        self._create_soup()
        self._create_json_dict()
        self._video_streams, self._audio_streams = self._extract_streams()

    audio_src_url: str = ""
    for audio_stream in self._audio_streams:
        # Apparently YT serves medium quality audio as its highest quality.
        if audio_stream["audio_quality"] == "AUDIO_QUALITY_MEDIUM":
            audio_src_url = audio_stream["src_url"]
            break

    # Clean the URL before requesting it.
    audio_src_url = utils.sanitize_url(audio_src_url)
    print("::-> Downloading the audio file...")

    # Request the audio source in chunks (stream=True).
    try:
        audio_resp: requests.Response = requests.get(
            audio_src_url, headers=utils.request_headers(), stream=True)
        audio_resp.raise_for_status()
    except requests.RequestException:
        print("::-> An error occurred while requesting the file")
        raise

    # Save to disk with is_video unset, so the file is written as audio.
    utils.save_to_disk(audio_resp,
                       self.get_video_title(),
                       path_to_save,
                       is_video=False)
    print("Done!\n")
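# utils.save_to_disk() is implemented elsewhere in the project; this is only a
# sketch of what such a chunked writer might look like, assuming it derives the
# file extension from the is_video flag. The 8192-byte chunk size is an
# arbitrary choice for the sketch.
import os
import requests

def save_to_disk_sketch(resp: requests.Response, title: str,
                        path_to_save: str = ".", is_video: bool = True) -> None:
    ext = ".mp4" if is_video else ".mp3"
    file_path = os.path.join(path_to_save or ".", title + ext)
    # Because the request was made with stream=True, the body was never loaded
    # into memory; iter_content() pulls it down chunk by chunk.
    with open(file_path, "wb") as fh:
        for chunk in resp.iter_content(chunk_size=8192):
            if chunk:  # skip keep-alive chunks
                fh.write(chunk)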
def process_dictionary(dictionary, output_file_dictionary, output_file_postings):
    dictionary_to_be_saved = save_to_postings_and_generate_dictionary(
        dictionary, output_file_postings)
    save_to_disk(dictionary_to_be_saved, output_file_dictionary)
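# save_to_postings_and_generate_dictionary() is defined elsewhere; a minimal
# sketch of the usual dictionary/postings split it implies: postings lists are
# serialized one term at a time, and the returned dictionary keeps only
# (document_frequency, byte_offset) per term, so it stays small enough to be
# loaded into memory whole while postings are read on demand via seek().
import pickle

def save_to_postings_and_generate_dictionary_sketch(dictionary, postings_path):
    term_to_entry = {}
    with open(postings_path, "wb") as postings_file:
        for term, postings in dictionary.items():
            offset = postings_file.tell()
            pickle.dump(postings, postings_file)
            term_to_entry[term] = (len(postings), offset)
    return term_to_entry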
            continue
    tweets = pd.DataFrame(tweets_data)
    return tweets


def result(total_pos, total_neu, total_neg):
    each_live_result = dict(num_pos=total_pos,
                            num_neu=total_neu,
                            num_neg=total_neg)
    return each_live_result


if __name__ == '__main__':
    tweets_data_path = 'tweets_stream/'
    out = {}
    files = get_all_files(Config.data_path + tweets_data_path, extension='json')
    for file in files:
        tweets = load_live_tweets(file)
        cleaned_tweets = preprocess(tweets)
        total_pos, total_neu, total_neg = vader_sentiment(cleaned_tweets)
        # Get the timestamp of each live-stream file.
        time_stamp = cleaned_tweets.iat[1, 0]
        each_live_result = result(total_pos, total_neu, total_neg)
        out.update({time_stamp: each_live_result})

    save_to_disk(data=out,
                 path=Config.reports_path,
                 filename='live_sentiment_summary.json')
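# vader_sentiment() is defined elsewhere; a minimal sketch of how such a
# counter is commonly built with the vaderSentiment package, assuming the
# DataFrame keeps the tweet text in a 'text' column and using the conventional
# +/-0.05 compound-score thresholds for positive/neutral/negative.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def vader_sentiment_sketch(tweets_df):
    analyzer = SentimentIntensityAnalyzer()
    total_pos = total_neu = total_neg = 0
    for text in tweets_df['text']:
        compound = analyzer.polarity_scores(str(text))['compound']
        if compound >= 0.05:
            total_pos += 1
        elif compound <= -0.05:
            total_neg += 1
        else:
            total_neu += 1
    return total_pos, total_neu, total_neg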
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        if current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                current_chunk = []
        else:
            continue
    return continuous_chunk


if __name__ == "__main__":
    tweets_data_path = 'tweets_by_country/'
    all_files = get_all_files(Config.base_path + tweets_data_path, extension='csv')
    out = {}
    for country in Config.country_prefix:
        df = pd.DataFrame()
        news_files = list(filter(lambda x: country in x, all_files))
        for file in news_files:
            data = pd.read_csv(file,
                               names=Config.colnames,
                               usecols=Config.usecols_list)
            data.dropna(axis=0, how='any', inplace=True)
            # DataFrame.append was removed in pandas 2.0; concat is the
            # supported equivalent.
            df = pd.concat([df, data], ignore_index=True)
        text_df = filter_df(Config.keywords, df)
        text_list = df_to_str(text_df)
        text_str = '. '.join(text_list)
        named_entities = get_continuous_chunks(text_str)
        out[country.replace('_', '')] = named_entities

    save_to_disk(data=out,
                 path=Config.base_path + 'report_named_entities/',
                 filename='entities_by_country.json')
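# The fragment at the top of this snippet is the tail of
# get_continuous_chunks(); a sketch of the standard NLTK recipe it follows:
# tokenize, POS-tag, run the built-in named-entity chunker, then merge
# adjacent Tree nodes into multi-word entity strings.
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

def get_continuous_chunks_sketch(text: str) -> list:
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk, current_chunk = [], []
    for subtree in chunked:
        if isinstance(subtree, Tree):
            # Inside a named entity: collect its tokens.
            current_chunk.append(" ".join(token for token, pos in subtree.leaves()))
        elif current_chunk:
            # The entity just ended: join the collected tokens into one string.
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            current_chunk = []
    return continuous_chunk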
if __name__ == "__main__":
    tweets_data_path = 'tweets_by_country/'  # or 'tweets_by_country_translation/'
    out = {}
    all_files = get_all_files(Config.data_path + tweets_data_path, extension='csv')
    for country in Config.country_prefix:  # e.g. ['de_', 'fr_', 'nl_']
        df = pd.DataFrame()
        news_files = list(filter(lambda x: country in x, all_files))
        for file in news_files:
            data = pd.read_csv(file,
                               names=Config.colnames,
                               usecols=Config.usecols_list)
            data.dropna(axis=0, how='any', inplace=True)
            df = pd.concat([df, data], ignore_index=True)
        text_df = filter_df(Config.keywords, df)
        # translated_df = translation(text_df)
        cleaned_df = preprocess(df=text_df)
        # df = translated_df
        # Run the sentiment pass over the preprocessed tweets.
        extreme_pos_count, total_pos, total_neu, total_neg, extreme_neg_count = \
            extreme_vader_sentiment(cleaned_df)
        out[country.replace('_', '')] = extreme_result(extreme_pos_count,
                                                       total_pos, total_neu,
                                                       total_neg,
                                                       extreme_neg_count)

    save_to_disk(data=out,
                 path=Config.reports_path,
                 filename='all_extreme_sentiment_summary_country.json')
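# extreme_vader_sentiment() is defined elsewhere; a sketch of what it might
# compute on top of the per-tweet compound scores from the vader_sentiment
# sketch earlier. The +/-0.75 "extreme" cut-off is an assumption for this
# sketch; the real threshold lives in the original helper, which is not shown.
# `analyzer` is a vaderSentiment SentimentIntensityAnalyzer instance.
def extreme_vader_sentiment_sketch(tweets_df, analyzer):
    extreme_pos = total_pos = total_neu = total_neg = extreme_neg = 0
    for text in tweets_df['text']:
        compound = analyzer.polarity_scores(str(text))['compound']
        if compound >= 0.05:
            total_pos += 1
            if compound >= 0.75:
                extreme_pos += 1
        elif compound <= -0.05:
            total_neg += 1
            if compound <= -0.75:
                extreme_neg += 1
        else:
            total_neu += 1
    return extreme_pos, total_pos, total_neu, total_neg, extreme_neg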
                'country': c_prefix
            }, line)))
        entities.extend(raw)
    return pd.DataFrame(entities)


if __name__ == "__main__":
    tweets_data_path = '../Data/tweets_by_country/'
    out = pd.DataFrame()
    out_count = {}
    all_files = get_all_files(Config.data_path + 'tweets_by_country/', extension='csv')
    for country in Config.country_prefix:
        df = pd.DataFrame()
        news_files = list(filter(lambda x: country in x, all_files))
        for file in news_files:
            data = pd.read_csv(file,
                               names=Config.colnames,
                               usecols=Config.usecols_list)
            data.dropna(axis=0, how='any', inplace=True)
            df = pd.concat([df, data], ignore_index=True)
        text_df = filter_df(Config.keywords, df)
        cleaned_df = preprocess_less(text_df)
        # Tag the lightly preprocessed tweets.
        extracted = tag(cleaned_df, country)
        out_count.update(
            {country: extracted.entity.value_counts().head(20).to_dict()})
        out = pd.concat([out, extracted])

    save_to_disk(out_count, Config.reports_path, 'spacy_entity_country.json')
    save_to_disk({'all': out.entity.value_counts().head(20).to_dict()},
                 Config.reports_path, 'spacy_entity_all.json')
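# The fragment at the top of this snippet is the tail of tag(); a minimal
# sketch of how such a tagger is typically written with spaCy, assuming the
# tweets sit in a 'text' column and that the small English model is used (the
# model actually loaded by the original script is not shown).
import pandas as pd
import spacy

nlp = spacy.load("en_core_web_sm")

def tag_sketch(tweets_df: pd.DataFrame, c_prefix: str) -> pd.DataFrame:
    entities = []
    for line in tweets_df['text']:
        doc = nlp(str(line))
        # One row per recognized entity, stamped with its country prefix.
        raw = [{'entity': ent.text, 'label': ent.label_, 'country': c_prefix}
               for ent in doc.ents]
        entities.extend(raw)
    return pd.DataFrame(entities)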
def download(self, vid_format: str, path_to_save=None) -> None:
    """
    Downloads the video.
    Current resolutions supported: all
    """
    if not vid_format:
        print("\n::-> Error: quality/resolution must not be None\n")
        exit(1)

    # Build the soup and JSON dict on first use.
    if not self._src_page_soup:
        self._create_soup()
        self._create_json_dict()
        self._video_streams, self._audio_streams = self._extract_streams()

    vid_src_url = None  # progressive stream (video + audio in one file)
    vid_wa_url = None   # video-without-audio stream
    for stream in self._video_streams:
        if stream["quality_label"] == vid_format:
            # A comma in the mime type means the stream carries both a video
            # and an audio codec, so no separate audio download is needed.
            if re.search(",", stream["mime_type"]):
                vid_src_url = stream["src_url"]
            else:
                vid_wa_url = stream["src_url"]
            break

    if vid_src_url:  # got the source url
        vid_src_url = utils.sanitize_url(vid_src_url)
        print("::-> Download in progress...")
        # ? get the response from the src url in chunks (stream=True)
        try:
            response: requests.Response = requests.get(
                vid_src_url, headers=utils.request_headers(), stream=True)
            response.raise_for_status()
        except requests.RequestException:
            print("::-> An error occurred while requesting the file.")
            raise
        utils.save_to_disk(response,
                           self.get_video_title(),
                           path_to_save,
                           is_video=True)
    # endif

    # ? When the video and audio urls are different
    elif vid_wa_url:
        # clean the url
        vid_wa_url = utils.sanitize_url(vid_wa_url)

        # download audio and video files to be combined
        self.download_audio(path_to_save)
        print("::-> Downloading the video file...")
        self._download_video(vid_wa_url, path_to_save)

        # get to know which video and audio files need to be combined
        if not path_to_save.endswith("/"):
            path_to_save += "/"
        vid_filelist: list = glob.glob(path_to_save + "*.mp4")
        last_vid_file: str = max(vid_filelist, key=os.path.getctime)
        audio_filelist: list = glob.glob(path_to_save + "*.mp3")
        last_audio_file: str = max(audio_filelist, key=os.path.getctime)

        # use ffmpeg to combine both, audio and video;
        # -v quiet keeps the console clean
        print("::-> Combining the audio and video files into one video file...")
        cmd: str = (f'ffmpeg -v quiet -i "{last_vid_file}" -i "{last_audio_file}" '
                    f'-map 0:v:0 -map 1:a:0 "{self.get_video_title()}_final.mp4"')
        # finally execute the command
        ffmpeg_exitcode = os.system(cmd)
        if ffmpeg_exitcode != 0:
            print("::-> Warning: ffmpeg exited with a non-zero status.")

        # delete the downloaded files so that only the combined file remains
        try:
            os.remove(last_vid_file)
            os.remove(last_audio_file)
        except OSError:
            pass
    # endif

    print("Successfully downloaded the video/audio titled: ")
    print(self.get_video_title())
    print("\nDownload is complete.\n")
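# A note on the ffmpeg invocation above: "-map 0:v:0 -map 1:a:0" selects the
# first video stream of input 0 (the downloaded .mp4) and the first audio
# stream of input 1 (the downloaded .mp3) and muxes them into one file. A
# sketch of the same merge via subprocess, which sidesteps the shell-quoting
# problems os.system() has with unusual characters in video titles.
import subprocess

def merge_av_sketch(video_path: str, audio_path: str, out_path: str) -> int:
    cmd = ["ffmpeg", "-v", "quiet",
           "-i", video_path, "-i", audio_path,
           "-map", "0:v:0", "-map", "1:a:0",
           out_path]
    # A return code of 0 means ffmpeg succeeded.
    return subprocess.run(cmd).returncode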