import random

import cleanup
import histogram


def random_word_tuple(input_string):
    """Return a random word from the text, weighted by how often it occurs."""
    input_list = cleanup.clean_text(input_string)
    tuples_histogram = histogram.tuples_histogram(input_list)
    # Pick a uniform position among all word occurrences, then walk the
    # cumulative counts until we pass it; the word we stop on is therefore
    # sampled in proportion to its frequency.
    index = random.randrange(len(input_list))
    value = 0
    hist_ind = 0
    while value <= index and hist_ind < len(tuples_histogram):
        value += tuples_histogram[hist_ind][1]
        hist_ind += 1
    return tuples_histogram[hist_ind - 1][0]
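# A minimal usage sketch with made-up input, assuming cleanup.clean_text
# returns a list of words and histogram.tuples_histogram returns
# (word, count) pairs:
#
#   word = random_word_tuple("one fish two fish red fish blue fish")
#   # 'fish' comes back with probability 4/8, each other word with 1/8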
import os

import numpy as np
import pandas as pd
from sklearn import preprocessing

import cleanup


def load_sentqs_tweets():
    # Reuse the cached preprocessing result if it exists.
    if os.path.isfile("data/sentqs_preprocessed.npz"):
        loaded_data = np.load("data/sentqs_preprocessed.npz")
        return (loaded_data['cleaned_tweets'], loaded_data["tweets"],
                loaded_data['y'], loaded_data['sentiment'],
                loaded_data["source_idx"], loaded_data["target_idx"])
    else:
        hashtags = ['ADBE', 'GOOGL', 'AMZN', 'AAPL', 'ADSK', 'BKNG', 'EXPE',
                    'INTC', 'MSFT', 'NFLX', 'NVDA', 'PYPL', 'SBUX', 'TSLA',
                    'XEL', 'positive', 'bad', 'sad']
        # Loading and preprocessing of tweets. seperate_tweets and
        # create_domain_adaptation_index are helpers defined elsewhere
        # in this module.
        df = pd.read_csv("Tweets.csv")
        sentiment = pd.to_numeric(df.iloc[:, -1], errors="raise", downcast="float")
        labels, tweets, sentiment = seperate_tweets(df.iloc[:, 1], hashtags, sentiment)
        cleaned_tweets = cleanup.clean_text(tweets)
        y = preprocessing.LabelEncoder().fit_transform(labels)
        source_idx, target_idx = create_domain_adaptation_index(tweets, labels, sentiment)
        np.savez_compressed("data/sentqs_preprocessed.npz", tweets=tweets,
                            cleaned_tweets=cleaned_tweets, y=y, sentiment=sentiment,
                            source_idx=source_idx, target_idx=target_idx)
        return cleaned_tweets, tweets, y, sentiment, source_idx, target_idx
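# A hedged usage sketch; the variable names below are illustrative only,
# and the arrays line up per tweet:
#
#   cleaned, raw, y, sentiment, src_idx, tgt_idx = load_sentqs_tweets()
#   # cleaned[i] is the preprocessed text of raw[i], y[i] its encoded
#   # hashtag label, and src_idx / tgt_idx index the domain-adaptation
#   # source and target subsets.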
def main_preprocessing():
    hashtags = ['ADBE', 'GOOGL', 'AMZN', 'AAPL', 'ADSK', 'BKNG', 'EXPE',
                'INTC', 'MSFT', 'NFLX', 'NVDA', 'PYPL', 'SBUX', 'TSLA', 'XEL']
    # Loading and preprocessing of tweets
    df = pd.read_csv("Tweets.csv")
    labels, tweets = seperate_tweets(df.iloc[:, 1], hashtags)
    cleaned_tweets = cleanup.clean_text(tweets)
    y = preprocessing.LabelEncoder().fit_transform(labels)
    # Get some statistics of the dataset
    describe_dataset(cleaned_tweets, labels)
    # Create feature representations: TF-IDF variants and a 1000-dimensional
    # skipgram embedding trained with negative sampling
    create_representation(cleaned_tweets, y)
    # Plot eigenspectrum of the embedding
    X = np.load("data/nsdqs_skipgram_embedding.npy")
    plot_eigenspectrum(X)
    # Plot a 2-dimensional t-SNE embedding of the representation
    plot_tsne(X, labels)
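# A hypothetical entry point, assuming this module is meant to be run
# directly to execute the whole preprocessing pipeline:
if __name__ == "__main__":
    main_preprocessing()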
from cleanup import clean_text  # assumption: clean_text lives in the project's cleanup module


def histogram(iterable):
    '''Builds a histogram mapping each unique item to [str(item), count].'''
    dictionary = {}
    unique_list = []
    # Collect each distinct item once, preserving first-seen order.
    for item in iterable:
        if item not in unique_list:
            unique_list.append(item)
    for item in unique_list:
        occurance = iterable.count(item)
        dictionary[item] = [str(item), occurance]
    return dictionary


def frequency(histogram_in, item):
    '''
    This function takes an item and histogram argument and returns
    the number of times that item appears in a text.
    '''
    if item in histogram_in:
        return histogram_in[item][1]
    else:
        return 0


def all_items(histogram_in):
    '''Calculates the total number of items in the text.'''
    total = 0
    for item in histogram_in:
        total += histogram_in[item][1]
    return total


if __name__ == "__main__":
    cleaned_text = clean_text('the_book.txt')
    print(histogram(cleaned_text))
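# A small worked example with made-up data, matching the [str(item), count]
# value shape used above:
#
#   hist = histogram(['fish', 'red', 'fish', 'blue', 'fish'])
#   # {'fish': ['fish', 3], 'red': ['red', 1], 'blue': ['blue', 1]}
#   frequency(hist, 'fish')  # -> 3
#   frequency(hist, 'cat')   # -> 0
#   all_items(hist)          # -> 5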
from flask import Flask, render_template, redirect, request, url_for
from cleanup import read_in_data, clean_text, add_stop, create_sentence
from random import randint, choice
from markov_chain import higher_markov, random_walk

app = Flask(__name__)

# Build the Markov chain once at startup from the source corpus.
source_text = 'static/tumblr_wordz.txt'
words = read_in_data(source_text)
cleaned_text = clean_text(words)
stripped_words = add_stop(cleaned_text)
markov = higher_markov(stripped_words)


@app.route('/')
def index():
    # Check for user input; otherwise print a sentence with 6 words
    usr_input = request.args.get('num')
    num_of_words = int(usr_input) if (usr_input is not None and usr_input != "") else 6
    # Create a sentence with the specified num_of_words
    output = random_walk(markov, num_of_words)
    sentence = create_sentence(output)
    return render_template('index.html', sentence=sentence)


if __name__ == '__main__':
    app.run(debug=True)  # assumption: standard Flask dev-server launch
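# Hedged usage note: with the server running, sentence length is controlled
# by the `num` query parameter read above, e.g.
#
#   GET /         -> renders a 6-word sentence
#   GET /?num=12  -> renders a 12-word sentence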
from cleanup import clean_text               # assumed project module, as used elsewhere
from markov_chain import random_walk_first   # assumption: random_walk_first lives here


def main():
    corpus = clean_text('book_1.txt')
    print(random_walk_first(corpus))
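# Hypothetical guard so the script can be run directly:
if __name__ == '__main__':
    main()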
import random

from cleanup import clean_text     # assumed project module
from dictogram import Dictogram    # assumption: the histogram classes live in
from listogram import Listogram    # dictogram.py and listogram.py


def dictionary_sample(histogram_in):
    # histogram_in maps word -> count; pick a position among all word
    # occurrences, then walk the cumulative counts to find its word.
    total = sum(histogram_in.values())
    value = random.randrange(total)
    cume = 0
    for word in histogram_in:
        cume += histogram_in[word]
        if cume > value:
            return word


def sample_list_O_stuff(histogram_in):
    # stuff means tuples or lists
    cap = 0
    i = 0
    while i < len(histogram_in):
        cap += histogram_in[i][1]
        i += 1
    # randrange(cap) keeps value strictly below the total count, so the
    # cumulative walk below always returns a word (randint(0, cap) could
    # draw cap itself and fall off the end).
    value = random.randrange(cap)
    cume = 0
    index = 0
    while index < len(histogram_in):
        cume += histogram_in[index][1]
        if cume > value:
            return histogram_in[index][0]
        index += 1


if __name__ == "__main__":
    text = clean_text('book_1.txt')
    hist = Dictogram(text)
    list_hist = Listogram(text)
    print(sample_list_O_stuff(list_hist))
    print(dictionary_sample(hist))
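# Quick sanity-check sketch (hypothetical data): sampling many times should
# roughly reproduce the word frequencies.
#
#   counts = {}
#   for _ in range(10000):
#       w = dictionary_sample({'fish': 3, 'red': 1})
#       counts[w] = counts.get(w, 0) + 1
#   # counts['fish'] should land near 7500, counts['red'] near 2500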
def __init__(self):
    self.text = cleanup.clean_text()           # cleaned source text
    self.token = words_list.token(self.text)   # list of word tokens
    self.word_dict = {}                        # word data, filled in later
import cleanup


def token(text):
    """
    Input: string that is clean text
    Process: converts string to list of words
    Output: list of words
    """
    return text.split()


if __name__ == "__main__":
    text = cleanup.clean_text()
    word_list = token(text)
    # print(word_list)
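# Doctest-style sketch of the expected behaviour:
#
#   token("one fish two fish")
#   # -> ['one', 'fish', 'two', 'fish']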