Exemple #1
0
def _twitter_data_process_func(tmp_file_path: str, meta_info: dict,
                               cache_dir: str = DEFAULT_CACHE_DIR,
                               clean_up_raw_data: bool = True,
                               verbose: bool = True):
    """
    Process function for the twitter dataset: unzip the downloaded file,
    look up the text of every listed tweet and store the merged result as csv.

    The first file in the zip is read as a csv with a ``twitterid`` column.
    The ids are passed to ``_lookup_tweets`` and the ``id``/``full_text`` of
    every returned tweet is merged back onto the original rows.

    :param tmp_file_path: The path to the downloaded raw zip file
    :param meta_info: Dict with (at least) the keys ``name`` and ``file_extension``,
        which together give the file name of the dataset inside ``cache_dir``
    :param cache_dir: The directory the dataset is extracted and written to
    :param clean_up_raw_data: Currently unused by this function
    :param verbose: If True, print how many of the listed tweets were downloaded
    :return: None; the resulting dataset is written to ``cache_dir``
    """
    from zipfile import ZipFile

    # Created before the cache is touched, so a failure here leaves cache_dir unchanged.
    twitter_api = _construct_twitter_api_connection()

    # Single location used for extraction, reading and the final write.
    dataset_path = os.path.join(cache_dir, meta_info['name']) + meta_info['file_extension']

    with ZipFile(tmp_file_path, 'r') as zip_file:  # Extract files to cache_dir
        file_list = zip_file.namelist()
        extract_single_file_from_zip(cache_dir, file_list[0], dataset_path, zip_file)

    # Read the file from where it was extracted instead of a hard-coded
    # 'twitter.sentiment.csv', so the function follows meta_info.
    # NOTE(review): assumes extract_single_file_from_zip places the member at dataset_path - confirm.
    df = pd.read_csv(dataset_path)

    twitter_ids = list(df['twitterid'])

    full_t = _lookup_tweets(twitter_ids, twitter_api)

    # Build the two columns separately: unpacking ``zip(*pairs)`` into two names
    # raises ValueError when no tweets could be looked up.
    tweet_ids = [tweet.id for tweet in full_t]
    t_texts = [tweet.full_text for tweet in full_t]
    tweet_texts_df = pd.DataFrame({'twitterid': tweet_ids, 'text': t_texts})

    # Default merge: inner join on the columns the two frames share, so rows
    # whose tweet could not be looked up are dropped.
    resulting_df = pd.merge(df, tweet_texts_df)

    # Overwrites the extracted id-only file with the text-enriched dataset.
    resulting_df.to_csv(dataset_path, index=False)

    if verbose:
        print("Downloaded {} out of {} tweets".format(len(full_t), len(twitter_ids)))
Exemple #2
0
def _unzip_process_func(tmp_file_path: str, meta_info: dict, cache_dir: str = DEFAULT_CACHE_DIR,
                        clean_up_raw_data: bool = True, verbose: bool = False, file_in_zip: str = None, singel: bool=False):
    """
    Process function for downloads that only have to be unzipped.

    If the zip holds exactly one file, or ``file_in_zip`` is given, that single
    file is extracted via ``extract_single_file_from_zip`` with the destination
    ``<cache_dir>/<name><file_extension>``. Otherwise the whole archive is
    extracted into the directory ``<cache_dir>/<name>``.

    :param tmp_file_path: The path to the downloaded raw zip file
    :param meta_info: Dict with the keys ``name`` and ``file_extension``
    :param cache_dir: The directory the content is extracted to
    :param clean_up_raw_data: Not used by this function
    :param verbose: If True, print a message before unzipping
    :param file_in_zip: Name of the file to extract when the zip contains more than one file
    :param singel: Not used by this function
    """
    from zipfile import ZipFile

    name = meta_info['name']
    target_file = os.path.join(cache_dir, name) + meta_info['file_extension']

    if verbose:
        print("Unzipping {} ".format(name))

    with ZipFile(tmp_file_path, 'r') as archive:
        members = archive.namelist()

        # Decide which single member (if any) should be pulled out of the archive.
        # A lone member takes precedence over an explicitly requested one.
        if len(members) == 1:
            chosen = members[0]
        elif file_in_zip:
            chosen = file_in_zip
        else:
            chosen = None

        if chosen is None:
            # No single file selected: unpack everything into a folder named after the model/dataset
            archive.extractall(path=os.path.join(cache_dir, name))
        else:
            extract_single_file_from_zip(cache_dir, chosen, target_file, archive)