# Example #1
# 0
def perform_vectorization():
    csv = data_helper.read_csv(clean_text_directory + filename)
    corpus = csv[integra_index][:size]
    counts, vocab = vectorization.create_bag_of_words(corpus)
    print_functions.print_examples(corpus, counts)
    print_functions.print_vocabulary(vocab)

    data_features = vectorization.extract_tfidf(counts)
    df_data_features = pd.DataFrame(data_features, columns=vocab)
    columns_to_keep = []
    termos_interesse = get_termos_interesse()

    print "Total Columns: " + str(len(df_data_features.columns))
    for column in df_data_features.columns:
        if np.mean(df_data_features[column]) > threshold or column in termos_interesse:
            columns_to_keep.append(column)
    df_data_features = df_data_features[columns_to_keep]
    print "Columns to Keep: " + str(len(columns_to_keep))

    data = pd.DataFrame(csv)
    data = data[data.columns.values[:-1]]

    new_columns = ["interesse"]  # , "exclusao", "diario", "tipo_ato"]
    original_columns = data.columns.values
    for i in range(0, len(new_columns)):
        data[new_columns[i]] = data[original_columns[i]]

    data = data[new_columns]
    new_data = data.join(df_data_features)
    data_helper.save_file(new_data, features_directory, filename)
    def __init__(self, data_path, estimators=100):
        """Load the training CSV and initialize model state.

        data_path  -- path of the CSV file to load via data_helper.
        estimators -- number of estimators for the forest (default 100).
        """
        self.result = 0
        self.train_cols = []  # columns used for training; populated later
        self.csv = data_helper.read_csv(data_path)
        self.forest = None  # the fitted model, assigned elsewhere
        # Presumably per-column statistics for normalization — populated
        # elsewhere; TODO confirm against the training code.
        self.mean_values = {}
        self.std_values = {}
        self.estimators = estimators
        # Python 2 integer division: first half of the rows is the train set.
        self.train_size = len(self.csv) / 2
def print_stats(source_file, columns):
    csv_features = dh.read_csv(source_file)

    size_entries = len(csv_features)
    entries = csv_features.groupby(columns)

    for name, group in entries:
        print "Group Name: " + str(name)
        entry = len(group)
        print entry / float(size_entries)
# Example #4
# 0
def clean_text(publicacao="", remove_excluidas=True):
    csv = data_helper.read_csv(raw_directory + filename)

    # Filter the "publicacao/diario"
    if publicacao != "":
        criterion = csv[diario_index].map(lambda x: x.startswith(publicacao))  # or x.startswith('doe-sp'))
        csv = csv[criterion]

    # Filter the "regras de exclusao"
    if remove_excluidas:
        csv = csv[csv[regra_exclusao_index] == 0]

    # Slice the data to clean the text
    data = csv[:][:size]

    clean_description_list = data_helper.clean_list(data)
    data[integra_index] = clean_description_list

    data_helper.save_file(data, clean_text_directory, filename)