Example #1
def test_remove_unused_categories_in_df(self):
    df = turn_columns_to_categorical(
        pd.DataFrame({
            'month': [4, 4, 4],
            'day': [5, 6, 7],
            'day_of_week': [1, 2, 3]
        }), ['month', 'day'])
    assert_array_equal(df.day.cat.categories.values, [5, 6, 7])
    df = filter_df(df, 'day', 5)
    assert_array_equal(df.day.cat.categories.values, [5, 6, 7])
    df = remove_unused_categories_in_df(df)
    assert_array_equal(df.day.cat.categories.values, [5])
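
The helper functions exercised by this test are not shown on this page. A minimal sketch that is consistent with the assertions above (the names come from the test; the bodies are assumptions) could look like:

import pandas as pd

def turn_columns_to_categorical(df, columns):
    # Cast the listed columns to pandas' 'category' dtype.
    for col in columns:
        df[col] = df[col].astype('category')
    return df

def remove_unused_categories_in_df(df):
    # Drop category levels that no longer occur in the data,
    # e.g. after rows have been filtered out.
    for col in df.select_dtypes(include='category').columns:
        df[col] = df[col].cat.remove_unused_categories()
    return df
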
Example #2
def test_filter_df(self):
    assert_frame_equal(
        filter_df(
            pd.DataFrame([4, 4, 4, 5, 5, 5, 6, 6, 6], columns=['month']),
            'month', 4), pd.DataFrame({'month': [4, 4, 4]}))
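
The filter_df called by the two tests above is likewise not shown; judging from the assertions, a plausible one-liner (an assumption, not the project's actual code) is:

def filter_df(df, column, value):
    # Keep only the rows whose `column` equals `value`, resetting the index
    # so the result compares equal to a freshly constructed frame.
    return df[df[column] == value].reset_index(drop=True)
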
Example #3
if __name__ == "__main__":
    tweets_data_path = 'tweets_by_country/'  # _translation
    out = {}
    all_files = get_all_files(Config.data_path + tweets_data_path,
                              extension='csv')
    for country in Config.country_prefix:  # ['de_', 'fr_', 'nl_']:
        df = pd.DataFrame()
        news_files = list(filter(lambda x: country in x, all_files))
        for file in news_files:
            data = pd.read_csv(file,
                               names=Config.colnames,
                               usecols=Config.usecols_list)
            data.dropna(axis=0, how='any', inplace=True)
            df = df.append(data, ignore_index=True)

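        # Per country: keep keyword-matching rows, clean the text, then tally
        # how many tweets hit extreme positive/negative VADER sentiment scores.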
        text_df = filter_df(Config.keywords, df)
        # translated_df = translation(text_df)
        cleaned_df = preprocess(df=text_df)  # df= translated_df

        extreme_pos_count, total_pos, total_neu, total_neg, extreme_neg_count = extreme_vader_sentiment(
            text_df)

        out[country.replace('_',
                            '')] = extreme_result(extreme_pos_count, total_pos,
                                                  total_neu, total_neg,
                                                  extreme_neg_count)

    save_to_disk(data=out,
                 path=Config.reports_path,
                 filename='all_extreme_sentiment_summary_country.json')
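
One caveat about the accumulation loop above: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so on current pandas the per-country frame is typically built with pd.concat instead, roughly:

frames = []
for file in news_files:
    data = pd.read_csv(file, names=Config.colnames, usecols=Config.usecols_list)
    data.dropna(axis=0, how='any', inplace=True)
    frames.append(data)
df = pd.concat(frames, ignore_index=True)
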
Example #4
        # Slide a fixed-size window over the MFCC frames, collecting each chunk
        # together with its label.
        for start in range(0, int((mfcc.shape[1] - window_size) / stride) + 1):
            chunk.append(mfcc[:,
                              start * stride:(start * stride + window_size)])
            chunk_labels.append(label)
    return (chunk, chunk_labels)


if __name__ == '__main__':
    """
    Classify Audio
    """
    args = get_args()

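    # Cached path: load the precomputed feature CSV, encode native_language as
    # integer codes, split into train/test, and pickle the label arrays.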
    if args.load_cache:
        df = pd.read_csv(args.source_csv)
        df = filter_df(df)
        factor = pd.factorize(df['native_language'])
        df['native_language'] = factor[0]
        language_set = factor[1].values
        del factor

        train_X, test_X, train_Y, test_Y = split_data(df, test_split=0.2)
        train_Y, test_Y = train_Y.values, test_Y.values
        train_size, test_size = len(train_X), len(test_X)
        pickle.dump([train_Y, test_Y], open('train_labels.dump', 'wb'))

        logging.basicConfig(
            level=(logging.DEBUG if args.debug else logging.INFO))
        logger = logging.getLogger(__name__)
        logger.debug(" Training set size = {}".format(train_size))
        logger.debug(" Testing set size = {}".format(test_size))
Example #5
def process_election(election_date):

    service = build('sheets', 'v4', credentials=creds)

    dfs = []
    requests_payload = []

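    # First data source: the zipped tab-separated precinct results file posted
    # by the North Carolina State Board of Elections (dl.ncsbe.gov).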
    sheet_id = 0
    file_name = 'results_pct'
    snake_date = election_date[0:4] + '_' + \
        election_date[4:6] + '_' + election_date[6:]
    url = f'https://dl.ncsbe.gov/ENRS/{snake_date}/{file_name}_{election_date}.zip'
    print(url)
    data = utils.get_zipfile(url, f'{file_name}_{election_date}.txt')
    raw_results_df = pd.read_csv(data, delimiter='\t')
    raw_results_df = utils.filter_df(raw_results_df)
    raw_results_df = raw_results_df.sort_values(['Contest Name', 'Choice'])

    grid_coordinate = utils.grid_for_sheet(sheet_id)
    request_payload = utils.payload_for_file(raw_results_df, grid_coordinate)
    # requests_payload.append(request_payload)

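    # Estimate per-contest precinct reporting: drop administrative "precincts"
    # (transfers, one-stop, absentee, provisional, curbside), then compare the
    # count of precincts with any votes recorded against the count of all precincts.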
    filtered_precinct_df = raw_results_df.copy()
    precinct_blacklist = ['TRANS', 'ONE', 'OS', 'CURB', 'PROVI', 'ABSEN']
    for item in precinct_blacklist:
        filtered_precinct_df = filtered_precinct_df[~filtered_precinct_df['Precinct'].str.contains(
            item)]
    grouped = filtered_precinct_df.groupby(
        ['Contest Name', 'Precinct']).sum().reset_index()
    grouped = grouped.dropna()
    precinct_counts = grouped.groupby('Contest Name').count()
    precinct_reported_counts = grouped[grouped['Total Votes'] >
                                       0].groupby('Contest Name').count()

    precinct_df = precinct_counts.join(
        precinct_reported_counts, rsuffix='reported')
    precinct_df = pd.DataFrame(
        {'precincts_reported_perc': precinct_df['Precinctreported'] / precinct_df['Precinct']}, index=precinct_df.index)

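    # Second data source: the live ENR JSON feed with candidate-level totals;
    # rename its terse column codes to readable headers before filtering.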
    sheet_id = 1496596366
    url = f'https://er.ncsbe.gov/enr/{election_date}/data/results_0.txt'
    print(url)
    resp = requests.get(url)
    candidate_df = pd.read_json(resp.content)
    candidate_df = candidate_df.drop(['cid', 'vfr', 'gid', 'lid', 'dtx', 'prt',
                                      'ptl', 'col', 'ogl', 'ref'], axis=1)
    candidate_df = candidate_df.rename(columns={'cnm': 'Race', 'bnm': 'Candidate',
                                                'pty': 'Party', 'vct': 'Total Votes', 'pct': 'Percent of Vote',
                                                'evc': 'Election Day Vote Count', 'avc': 'Absentee Vote Count', 'ovc': 'One-Stop Vote Count', 'pvc': 'Provisional Vote Count'})
    filtered_df = utils.filter_df(candidate_df)
    filtered_df = filtered_df.sort_values(['Race', 'Candidate'])
    grid_coordinate = utils.grid_for_sheet(sheet_id)
    request_payload = utils.payload_for_file(filtered_df, grid_coordinate)
    # requests_payload.append(request_payload)

    filtered_df = filtered_df.loc[filtered_df.groupby(
        ['Race', 'Candidate'])['Total Votes'].idxmax().dropna()]
    joined = utils.build_joined_df(filtered_df, precinct_df)

    sheet_id = 2103006474
    grid_coordinate = utils.grid_for_sheet(sheet_id)
    request_payload = utils.payload_for_file(joined, grid_coordinate)
    requests_payload.append(request_payload)

    update_payload = {'requests': requests_payload}

    sheet = service.spreadsheets()
    request = service.spreadsheets().batchUpdate(
        spreadsheetId=SPREADSHEET_ID, body=update_payload)
    response = request.execute()
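
The utils helpers that build the Sheets requests are not shown here. One plausible shape that fits how they are used above (an assumption, not the project's actual code) is a pasteData request targeting each sheet:

def grid_for_sheet(sheet_id, row_index=0, column_index=0):
    # GridCoordinate pointing at the top-left cell of the target sheet/tab.
    return {'sheetId': sheet_id, 'rowIndex': row_index, 'columnIndex': column_index}

def payload_for_file(df, grid_coordinate):
    # One batchUpdate request that pastes the frame as CSV text
    # starting at the given coordinate.
    return {
        'pasteData': {
            'coordinate': grid_coordinate,
            'data': df.to_csv(index=False),
            'type': 'PASTE_NORMAL',
            'delimiter': ',',
        }
    }

Each such request dict is collected in requests_payload and sent in a single spreadsheets().batchUpdate call, as in the last lines of the example.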