def perform_vectorization():
    # Load the cleaned publications and build a bag-of-words model
    # over the "integra" (full text) column.
    csv = data_helper.read_csv(clean_text_directory + filename)
    corpus = csv[integra_index][:size]
    counts, vocab = vectorization.create_bag_of_words(corpus)
    print_functions.print_examples(corpus, counts)
    print_functions.print_vocabulary(vocab)

    # Re-weight the raw counts with TF-IDF and wrap them in a DataFrame,
    # one column per vocabulary term.
    data_features = vectorization.extract_tfidf(counts)
    df_data_features = pd.DataFrame(data_features, columns=vocab)

    # Keep only terms whose mean TF-IDF exceeds the threshold, plus the
    # explicitly listed terms of interest ("termos de interesse").
    columns_to_keep = []
    termos_interesse = get_termos_interesse()
    print("Total Columns: " + str(len(df_data_features.columns)))
    for column in df_data_features.columns:
        if np.mean(df_data_features[column]) > threshold or column in termos_interesse:
            columns_to_keep.append(column)
    df_data_features = df_data_features[columns_to_keep]
    print("Columns to Keep: " + str(len(columns_to_keep)))

    # Carry over the leading label column(s) from the original CSV under
    # their new names and join them with the selected TF-IDF features.
    data = pd.DataFrame(csv)
    data = data[data.columns.values[:-1]]
    new_columns = ["interesse"]  # , "exclusao", "diario", "tipo_ato"]
    original_columns = data.columns.values
    for i in range(len(new_columns)):
        data[new_columns[i]] = data[original_columns[i]]
    data = data[new_columns]
    new_data = data.join(df_data_features)
    data_helper.save_file(new_data, features_directory, filename)
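# A minimal sketch of what the vectorization helpers used above might look
# like, assuming they wrap scikit-learn; the real create_bag_of_words and
# extract_tfidf implementations are not part of this excerpt, so treat the
# names and signatures below as assumptions.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def create_bag_of_words_sketch(corpus):
    # Fit a term-count matrix over the corpus and return it together with
    # the learned vocabulary, matching the (counts, vocab) pair used above.
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(corpus)
    return counts, vectorizer.get_feature_names_out()

def extract_tfidf_sketch(counts):
    # Re-weight the raw counts by TF-IDF; toarray() yields the dense matrix
    # that pd.DataFrame(data_features, columns=vocab) expects.
    transformer = TfidfTransformer()
    return transformer.fit_transform(counts).toarray()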
def __init__(self, data_path, estimators=100):
    # Load the feature CSV and reserve the first half of the rows for
    # training (integer division keeps train_size an int).
    self.result = 0
    self.train_cols = []
    self.csv = data_helper.read_csv(data_path)
    self.forest = None
    self.mean_values = {}
    self.std_values = {}
    self.estimators = estimators
    self.train_size = len(self.csv) // 2
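# A hedged sketch of a train() method this class might pair with, assuming
# the CSV's non-label columns are TF-IDF features and "interesse" is the
# label; the actual training code is not part of this excerpt.
from sklearn.ensemble import RandomForestClassifier

def train(self):
    # Fit the forest on the first half of the rows (see train_size above).
    train_rows = self.csv[:self.train_size]
    self.train_cols = [c for c in train_rows.columns if c != "interesse"]
    self.forest = RandomForestClassifier(n_estimators=self.estimators)
    self.forest.fit(train_rows[self.train_cols], train_rows["interesse"])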
def print_stats(source_file, columns):
    # Group the feature CSV by the given columns and print each group's
    # share of the total number of entries.
    csv_features = dh.read_csv(source_file)
    size_entries = len(csv_features)
    entries = csv_features.groupby(columns)
    for name, group in entries:
        print("Group Name: " + str(name))
        entry = len(group)
        print(entry / float(size_entries))
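# Example usage, assuming a features file produced by perform_vectorization
# above and grouping by the "interesse" label; the concrete path is
# illustrative only.
print_stats(features_directory + filename, ["interesse"])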
def clean_text(publicacao="", remove_excluidas=True):
    csv = data_helper.read_csv(raw_directory + filename)

    # Filter by "publicacao/diario" (official gazette) prefix, if given.
    if publicacao != "":
        criterion = csv[diario_index].map(lambda x: x.startswith(publicacao))  # or x.startswith('doe-sp'))
        csv = csv[criterion]

    # Drop entries flagged by the "regras de exclusao" (exclusion rules).
    if remove_excluidas:
        csv = csv[csv[regra_exclusao_index] == 0]

    # Slice the data, clean the text column, and save the result.
    data = csv[:][:size]
    clean_description_list = data_helper.clean_list(data)
    data[integra_index] = clean_description_list
    data_helper.save_file(data, clean_text_directory, filename)
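# A minimal sketch of what data_helper.clean_list might do, assuming a
# typical normalization pass (strip non-letter characters, lowercase) over
# the "integra" text column; the real helper is not shown in this excerpt,
# so treat the function below as an assumption.
import re

def clean_list_sketch(data):
    cleaned = []
    for text in data[integra_index]:
        # Keep ASCII and accented Latin letters, collapse everything else
        # to single spaces, then lowercase and trim.
        text = re.sub(u"[^a-zA-Z\u00C0-\u00FF]+", " ", text)
        cleaned.append(text.lower().strip())
    return cleaned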