Ejemplo n.º 1
0
def perform_vectorization():
    """Build the feature table for the cleaned corpus and save it.

    Steps, in order:

    1. Read ``clean_text_directory + filename`` with ``data_helper.read_csv``
       and take the first ``size`` entries of its ``integra_index`` column
       as the corpus.
    2. Turn the corpus into bag-of-words counts and pass those counts to
       ``vectorization.extract_tfidf``.
    3. Keep only the vocabulary columns whose mean value is above
       ``threshold`` or whose name is in ``get_termos_interesse()``.
    4. Copy the first column of the csv into a column named "interesse",
       join it with the kept feature columns and hand the result to
       ``data_helper.save_file``.

    Takes no arguments and returns nothing; every setting
    (``clean_text_directory``, ``filename``, ``integra_index``, ``size``,
    ``threshold``, ``features_directory``) is read from names defined
    outside this function.
    """
    csv = data_helper.read_csv(clean_text_directory + filename)
    # NOTE(review): the features cover only the first `size` rows, while
    # `data` below is built from the whole csv; rows past `size` presumably
    # end up with NaN features after the join -- confirm this is intended.
    corpus = csv[integra_index][:size]
    counts, vocab = vectorization.create_bag_of_words(corpus)
    print_functions.print_examples(corpus, counts)
    print_functions.print_vocabulary(vocab)

    data_features = vectorization.extract_tfidf(counts)
    df_data_features = pd.DataFrame(data_features, columns=vocab)
    termos_interesse = get_termos_interesse()

    # A single parenthesised argument keeps the output identical under
    # Python 2 while also being valid Python 3 syntax.
    print("Total Columns: " + str(len(df_data_features.columns)))
    columns_to_keep = [
        column
        for column in df_data_features.columns
        if np.mean(df_data_features[column]) > threshold
        or column in termos_interesse
    ]
    df_data_features = df_data_features[columns_to_keep]
    print("Columns to Keep: " + str(len(columns_to_keep)))

    # Drop the last column of the csv before picking the label columns.
    data = pd.DataFrame(csv)
    data = data[data.columns.values[:-1]]

    # The i-th remaining csv column is exposed under the i-th new name.
    # Further names ("exclusao", "diario", "tipo_ato") are currently disabled.
    new_columns = ["interesse"]
    original_columns = data.columns.values
    for position, new_column in enumerate(new_columns):
        data[new_column] = data[original_columns[position]]

    data = data[new_columns]
    new_data = data.join(df_data_features)
    data_helper.save_file(new_data, features_directory, filename)
Ejemplo n.º 2
0
# Module-level word lists, each built once when this code is executed.
roman_numbers = get_roman_numbers()
state_initials = bl.get_state_initials()

# Multi-word state names are flattened into one list of single words.
state_names = [
    word
    for state in bl.get_state_names()
    for word in state.split()
]

# Same flattening for the state capitals.
state_capitals = [
    word
    for capital in bl.get_state_capitals()
    for word in capital.split()
]

months = get_months()
letters = get_letters()
# Law-related words are stored in their stemmed form.
law_words = [stemmer.stem(word) for word in get_law_words()]
termos_interesse = get_termos_interesse()
portuguese_names = get_portuguese_names()

def clean_text( raw_text ):
    # Function to convert a raw text to a string of words
    # The input is a single string (a raw text), and
    # the output is a single string (a preprocessed text)
    #
    # 1. Includes a space before "<" to avoid joining two words together
    pre_text = raw_text.replace("<", " <")
    #
    # 2. Some states (such as Acre) uses "_" to separate a line
    pre_text = pre_text.replace("_", " ")
    #
    # 3. Remove HTML
    review_text = BeautifulSoup(pre_text).get_text()