Code Example #1
import os
import time

import pandas as pd
from halo import Halo


def add_features(input_file, output_file, force):
    """ Runs build features scripts to turn processed data from (../processed) into
        improved data (saved in ../processed as well).

        Parameters
        ----------
        input_file: str
            Input file to be processed
        output_file: str
            Output processed file
        force: bool
            Force rebuilding the features even if the output file already exists
    """
    spinner = Halo(text='Building features...', spinner='dots')

    clean_data = pd.read_csv(input_file)

    # Add lat/lon columns
    if force or not os.path.exists(output_file):
        spinner.start("Adding Latitude and Longitude columns")
        transformed_data = apply_nomatin(clean_data)
        transformed_data.to_csv(output_file, index=False)
        spinner.succeed("Latitude and Longitude features added!")
    else:
        spinner.start("Loading transformed file...")
        time.sleep(2)
        transformed_data = pd.read_csv(output_file)
        spinner.stop_and_persist(text="Transformed file already exists!")

    # Combine features
    transformed_data = combine_features(transformed_data)

    transformed_data.to_csv(output_file, index=False)

    return transformed_data
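
The helpers apply_nomatin and combine_features are project-specific and not shown above. Purely as a rough sketch of what a Nominatim-based geocoding step of that shape could look like with geopy, the snippet below adds latitude/longitude columns; the 'address' column name, the user agent string, and the one-second rate limit are assumptions, not part of the original code.

import pandas as pd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter


def apply_nominatim_sketch(df):
    """Sketch: add 'latitude'/'longitude' columns by geocoding an assumed 'address' column."""
    geolocator = Nominatim(user_agent="feature-building-sketch")  # user agent string is an assumption
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)  # throttle requests to the public API
    locations = df["address"].apply(geocode)
    df["latitude"] = locations.apply(lambda loc: loc.latitude if loc else None)
    df["longitude"] = locations.apply(lambda loc: loc.longitude if loc else None)
    return df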
Code Example #2
import logging
import os
import time

import pandas as pd
from halo import Halo


def process_dataset(input_file, output_file, scrape):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).

        Parameters
        ----------
        input_file: str
            Input file to be processed
        output_file: str
            Output processed file
        scrape: bool
            Force the scraping step even if the raw input file already exists
    """
    spinner = Halo(text='Making dataset...', spinner='dots')
    logger = logging.getLogger(__name__)
    logger.info('Making final dataset from raw data')
    # Scrape data
    if scrape or not os.path.exists(input_file):
        spinner.start("Scraping data")
        with open('./references/urls.txt', 'r') as f:
            urls = f.readlines()
        scraped_dfs = []
        for url in urls:
            scraped_dfs.append(navigate(url, 1, 500))
        # Save results
        raw_data = pd.concat(scraped_dfs)
        raw_data.to_csv(input_file, index=False)
        spinner.succeed("Data Scrapped!")
    else:
        spinner.succeed("Loading scraped file...")
        raw_data = pd.read_csv(input_file)
        spinner.succeed("Scraped file already exists!")

    # Remove duplicates
    spinner.start("Removing duplicates and invalid values...")
    time.sleep(1)
    interim_data = remove_duplicates_and_na(raw_data)
    interim_data.to_csv(output_file.replace("processed", "interim"),
                        index=False)
    spinner.succeed("Done removing duplicates!")

    # Remove outliers
    spinner.start("Removing outliers and inconsistent values...")
    time.sleep(1)
    final_data = remove_outliers(interim_data)
    final_data.to_csv(output_file, index=False)
    spinner.succeed("Done removing outliers!")
    spinner.start("Cleaning processing done!")
    spinner.stop_and_persist(symbol='✔'.encode('utf-8'),
                             text="Cleaning processing done!")

    return final_data
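
The helpers navigate, remove_duplicates_and_na and remove_outliers belong to the surrounding project and are not shown here. As an illustration only of how process_dataset might be exposed on the command line, below is a minimal Click wrapper; the argument names, the --scrape flag, and the module layout are assumptions rather than part of the original project.

import click


@click.command()
@click.argument("input_file", type=click.Path())
@click.argument("output_file", type=click.Path())
@click.option("--scrape", is_flag=True, help="Force the scraping step even if the raw file exists.")
def main(input_file, output_file, scrape):
    # process_dataset is assumed to be importable from the surrounding module.
    process_dataset(input_file, output_file, scrape)


if __name__ == "__main__":
    main()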