def import_set():
    with open(get_file_path('interim\\training.pkl'), 'rb') as f:
        training = pickle.load(f)

    with open(get_file_path('interim\\testing.pkl'), 'rb') as f:
        testing = pickle.load(f)

    return training, testing
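A quick usage sketch, assuming the two pickles were written earlier by export_training_testing (defined further down this page):

training, testing = import_set()
print(len(training), len(testing))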
Example 2
def import_neutral_negative():
    with open(get_file_path("negative.txt"), 'r') as f:
        neg = f.read()
    with open(get_file_path("neutral.txt"), 'r') as f:
        neu = f.read()
    return neg.split(), neu.split()
Example 3
def export_to_dict():
    with open(get_file_path("positive-words.txt"), "r") as f:
        positive = f.read()
    with open(get_file_path("negative-words.txt"), "r") as f:
        negative = f.read()
    # Map every positive word to +1 and every negative word to -1.
    words = dict()
    for word in positive.split():
        words[word] = 1
    for word in negative.split():
        words[word] = -1
    # Save the lookup table as an importable Python module.
    with open("words.py", "w") as f:
        f.write("WORD_SENTIMENT=" + str(words))
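Because the table is written out as a Python module, it can be imported directly. A minimal scoring sketch; the sample sentence and helper name are illustrative:

from words import WORD_SENTIMENT

def score_sentence(sentence):
    # Sum the sentiment of every known word; unknown words score 0.
    return sum(WORD_SENTIMENT.get(w, 0) for w in sentence.lower().split())

print(score_sentence("great camera but terrible battery"))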
def import_dataset(dataset_name):
    reviews_list = list()
    # The raw files are in JSON Lines format: one review object per line.
    with open(get_file_path("raw\\" + dataset_name + ".json"), encoding="utf8") as json_file:
        for line in json_file:
            sample = json.loads(line)
            reviews_list.append(sample)

    return reviews_list
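A hedged usage sketch; the field names in the sample line are assumptions based on the Amazon review categories used elsewhere on this page:

# Hypothetical line from raw\reviews_Automotive.json (one JSON object per line):
#   {"reviewText": "Works great on my truck.", "overall": 5.0}
reviews = import_dataset("reviews_Automotive")
print(len(reviews), reviews[0].get("reviewText"))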
def create_lexicon(corpus, name):
    with open(get_file_path("new_positive.txt"), "r") as f:
        positive = f.read()
    with open(get_file_path("new_negative.txt"), "r") as f:
        negative = f.read()
    print("Cooccurrence matrix")
    d = cooccurrence_matrix(corpus)
    print("Sorting vocab")
    vocab = get_sorted_vocab(d)
    print("Cosine matrix")
    cm = cosine_similarity_matrix(vocab, d)
    print(datetime.now())
    print("Propagation")
    prop = graph_propagation(cm, vocab, positive.split(), negative.split(), 2)
    print(datetime.now())
    # Rank every word by its propagated sentiment score, highest first.
    final = sorted(prop.items(), key=itemgetter(1), reverse=True)
    print(datetime.now())
    print("Saving")
    save_lexicon_results(final, name)
    print(datetime.now())
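cooccurrence_matrix, get_sorted_vocab, cosine_similarity_matrix and graph_propagation are project helpers that do not appear on this page. As a rough illustration of the first step only, here is a minimal window-based co-occurrence counter; the window size and the nested-dict layout are assumptions, not the project's actual implementation:

from collections import defaultdict

def cooccurrence_matrix_sketch(corpus, window=4):
    # corpus: iterable of token lists. Returns {word: {neighbour: count}}.
    d = defaultdict(lambda: defaultdict(int))
    for tokens in corpus:
        for i, w in enumerate(tokens):
            for n in tokens[max(0, i - window):i + window + 1]:
                if n != w:
                    d[w][n] += 1
    return d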
def prepare_dataframe():
    with open(get_file_path("dataframe.pkl"), 'rb') as f:
        df = pickle.load(f)
    new_df = pd.DataFrame(index=[
        'reviews_Automotive', 'reviews_Cell_Phones_and_Accessories',
        'reviews_Video_Games', 'reviews_Movies_and_TV'
    ],
                          columns=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    for file in get_file_names():
        # Select each category's rows once instead of once per column.
        df_temp = df[df.categories == file]
        for i in range(0, 10):
            new_df.at[file, i] = df_temp[i].mean()

    return new_df
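Usage sketch: each row is a category and each column holds the mean of that numbered score across the category's reviews (the column meaning is inferred from the frame built above):

new_df = prepare_dataframe()
print(new_df.loc['reviews_Automotive'])  # ten per-column means for one category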
def plot_explained_variance(file_name):
    data = list()
    data_y = list()
    total = 0.0
    # The file holds one explained-variance value per component;
    # accumulate them into a running (cumulative) total.
    with open(get_file_path(file_name), "r") as file:
        for i, num in enumerate(file.read().split(), start=1):
            total += float(num)
            data.append(total)
            data_y.append(i)
    plt.plot(data, data_y)
    plt.ylabel('Number of Components')
    plt.xlabel('Explained Variance')
    plt.title('Bag Of Nouns SVD Components Explained Variance')
    plt.show()
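A hedged sketch of how the per-component numbers in that file might be produced, assuming they are explained-variance ratios from scikit-learn's TruncatedSVD; the output file name is illustrative:

from sklearn.decomposition import TruncatedSVD

def export_explained_variance(X, n_components, file_name="svd_variance.txt"):
    # Fit the SVD and write one explained-variance ratio per component.
    svd = TruncatedSVD(n_components=n_components)
    svd.fit(X)
    with open(get_file_path(file_name), "w") as f:
        f.write(" ".join(str(r) for r in svd.explained_variance_ratio_))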
def import_and_divide():
    files = get_file_names()
    training = list()
    testing = list()
    for file in files:
        with open(get_file_path('interim\\sample_' + file + '.pkl'), 'rb') as f:
            lines = pickle.load(f)
            # Sample 70,000 reviews without replacement for training;
            # everything left over becomes the test split.
            t = choice(lines, size=70000, replace=False)
            for l in tqdm(t):
                lines.remove(l)
                l['category'] = file
                training.append(l)
            for l in lines:
                l['category'] = file
                testing.append(l)

    export_training_testing(training, testing)
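The snippet above relies on names imported elsewhere; presumably choice is numpy.random.choice and tqdm is the progress-bar library, i.e. imports along the lines of:

from numpy.random import choice
from tqdm import tqdm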
from features.explore import bag_of_words, tf_idf, bigrm, only_nouns
from src.data.import_dataset import import_cleaned_training_set
from visualization.visualize import display_features
from features.normalize import lemmatize, letters_only, lower_only, remove_contractions, remove_stopwords
from src.utils.utils import get_file_path
from nltk.corpus import stopwords


# Dump NLTK's English stop word list to a text file, one word per line.
print(type(stopwords.words('english')))
with open(get_file_path("stopwords.txt"), "w") as file:
    for word in stopwords.words('english'):
        file.write("%s\n" % word)
def export_to_txt(scores, name):
    with open(get_file_path(name + ".txt"), "w") as file:
        for t in scores:
            file.write(' '.join(str(s) for s in t) + '\n')
def import_cleaned_testing_set():
    with open(get_file_path('processed\\testing.pkl'), 'rb') as file:
        testing = pickle.load(file)

    return testing
def save_lexicon_results(results, name):
    with open(get_file_path(name + '.pkl'), 'wb') as f:
        pickle.dump(results, f, pickle.HIGHEST_PROTOCOL)
def export_nouns_adj_adv(data, filename):
    with open(get_file_path('processed\\' + filename + '.pkl'), 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
Example 14
def import_tagged_words(name):
    with open(get_file_path('processed\\tagged_words_' + name + '.pkl'),
              'rb') as file:
        return pickle.load(file)
Example 15
def read_pickle(folder, file):
    # Build the path from the folder argument (e.g. 'interim', 'processed').
    with open(get_file_path(folder + '\\' + file + '.pkl'), 'rb') as lines:
        return pickle.load(lines)
def export_training_testing(training, testing):
    with open(get_file_path("interim\\training.pkl"), "wb") as file:
        pickle.dump(training, file, pickle.HIGHEST_PROTOCOL)

    with open(get_file_path("interim\\testing.pkl"), "wb") as file:
        pickle.dump(testing, file, pickle.HIGHEST_PROTOCOL)
Example 17
def import_lexicon_set():
    with open(get_file_path('interim\\lexicon_dataset_smaller.pkl'),
              'rb') as f:
        return pickle.load(f)
def write_new_pickle(review_list, name):
    with open(get_file_path("interim\\" + name + ".pkl"), "wb") as f:
        pickle.dump(review_list, f)
def export_scores(scores):
    with open(get_file_path("scores.txt"), "w") as file:
        file.write(scores)
Example 20
def read_pickle_files(file):
    with open(get_file_path('raw\\' + file + '.pkl'), 'rb') as lines:
        return pickle.load(lines)
def export_comments(set_to_save):
    with open(get_file_path("processed\\comments.pkl"), "wb") as file:
        pickle.dump(set_to_save, file, pickle.HIGHEST_PROTOCOL)
def export_dataset(set_to_save, name):
    with open(get_file_path("processed\\" + name + ".pkl"), "wb") as file:
        pickle.dump(set_to_save, file, pickle.HIGHEST_PROTOCOL)
from features.normalize import lemmatize

from src.utils.utils import get_file_path


with open(get_file_path("positive-words.txt"), "r") as f:
    positive = f.read()
with open(get_file_path("negative-words.txt"), "r") as f:
    negative = f.read()
positive = positive.replace("\n", " ")
negative = negative.replace("\n", " ")
pos = lemmatize(positive)
neg = lemmatize(negative)

# Deduplicate and sort the lemmatized words, then write one per line.
pos = sorted(set(pos.split()))
with open(get_file_path("positive_lemmatized.txt"), "w") as new_positive:
    for word in pos:
        new_positive.write("%s\n" % word)

neg = sorted(set(neg.split()))
with open(get_file_path("negative_lemmatized.txt"), "w") as new_negative:
    for word in neg:
        new_negative.write("%s\n" % word)
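lemmatize comes from features.normalize and is not shown on this page. A minimal stand-in, assuming straightforward token-wise WordNet lemmatization (an assumption, not the project's actual code):

from nltk.stem import WordNetLemmatizer

def lemmatize_sketch(text):
    # Reduce each whitespace-separated token to its WordNet lemma.
    wnl = WordNetLemmatizer()
    return " ".join(wnl.lemmatize(token) for token in text.split())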

Example 24
def get_stopwords():
    with open(get_file_path('stopwords.txt'), 'r') as f:
        return f.read()
Example 25
def get_file_path_test():
    file_path = utils.get_file_path(__file__)
    print(file_path)
def export_sampled_datasets(train, file):
    with open(get_file_path("raw\\" + file + ".pkl"), "wb") as f:
        dump(train, f)