def _twitter_data_process_func(tmp_file_path: str, meta_info: dict,
                               cache_dir: str = DEFAULT_CACHE_DIR,
                               clean_up_raw_data: bool = True,
                               verbose: bool = True):
    """
    Process function for the Twitter sentiment dataset.

    Extracts the downloaded zip, rehydrates the tweet texts by looking the
    stored tweet ids up through the Twitter API, merges the texts onto the
    sentiment annotations and writes the result as a csv in ``cache_dir``.

    :param tmp_file_path: path to the downloaded raw zip file
    :param meta_info: metadata dict; must contain 'name' and 'file_extension'
    :param cache_dir: directory to store the processed dataset in
    :param clean_up_raw_data: unused here; kept for process-func interface
        compatibility
    :param verbose: if True, print how many tweets could be downloaded
    """
    from zipfile import ZipFile

    twitter_api = _construct_twitter_api_connection()

    model_name = meta_info['name']
    full_path = os.path.join(cache_dir, model_name) + meta_info['file_extension']

    with ZipFile(tmp_file_path, 'r') as zip_file:  # Extract files to cache_dir
        file_list = zip_file.namelist()
        extract_single_file_from_zip(cache_dir, file_list[0], full_path, zip_file)

    # NOTE(review): file name is hard coded; presumably it matches the single
    # file inside the downloaded zip — confirm against the download source.
    file_path = os.path.join(cache_dir, 'twitter.sentiment' + '.csv')
    df = pd.read_csv(file_path)

    twitter_ids = list(df['twitterid'])

    full_t = _lookup_tweets(twitter_ids, twitter_api)
    tweet_texts = [[tweet.id, tweet.full_text] for tweet in full_t]
    # Fix: the original unconditional `zip(*tweet_texts)` unpacking raises
    # ValueError when the API returns no tweets (deleted/private accounts,
    # rate limiting); fall back to empty columns instead of crashing.
    if tweet_texts:
        tweet_ids, t_texts = list(zip(*tweet_texts))
    else:
        tweet_ids, t_texts = (), ()
    tweet_texts_df = pd.DataFrame({'twitterid': tweet_ids, 'text': t_texts})

    # Default inner join on the shared 'twitterid' column keeps only the
    # rows whose tweet could still be downloaded.
    resulting_df = pd.merge(df, tweet_texts_df)

    dataset_path = os.path.join(cache_dir,
                                meta_info['name'] + meta_info['file_extension'])
    resulting_df.to_csv(dataset_path, index=False)

    if verbose:
        print("Downloaded {} out of {} tweets".format(len(full_t),
                                                      len(twitter_ids)))
def _unzip_process_func(tmp_file_path: str, meta_info: dict,
                        cache_dir: str = DEFAULT_CACHE_DIR,
                        clean_up_raw_data: bool = True,
                        verbose: bool = False,
                        file_in_zip: str = None,
                        singel: bool = False):
    """
    Simple process function for processing models that only needs to be
    unzipped after download.

    :param tmp_file_path: The path to the downloaded raw file
    :param meta_info: metadata dict; must contain 'name' and 'file_extension'
    :param clean_up_raw_data: unused here; kept for interface compatibility
    :param verbose: if True, print a message while unzipping
    :param file_in_zip: Name of the model file in the zip, if the zip
        contains more than one file
    """
    from zipfile import ZipFile

    model_name = meta_info['name']
    full_path = os.path.join(cache_dir, model_name) + meta_info['file_extension']

    if verbose:
        print("Unzipping {} ".format(model_name))

    with ZipFile(tmp_file_path, 'r') as zip_file:
        archive_members = zip_file.namelist()

        if len(archive_members) == 1:
            # Single-file archive: that file is the model; place it at full_path.
            extract_single_file_from_zip(cache_dir, archive_members[0],
                                         full_path, zip_file)
            return

        if file_in_zip:
            # The caller named the model file inside a multi-file archive.
            extract_single_file_from_zip(cache_dir, file_in_zip,
                                         full_path, zip_file)
            return

        # Otherwise unpack everything into a directory named after the
        # model/dataset.
        zip_file.extractall(path=os.path.join(cache_dir, meta_info['name']))