Example #1
import os
import random

import numpy as np
import pandas as pd
from sklearn import preprocessing

import cleanup    # project-local helpers (clean_text)
import histogram  # project-local helpers (tuples_histogram)

# seperate_tweets, create_domain_adaptation_index, describe_dataset,
# create_representation, plot_eigenspectrum and plot_tsne are defined
# elsewhere in the project


def random_word_tuple(input_string):
    """Return a random word, weighted by its frequency in the text."""
    input_list = cleanup.clean_text(input_string)
    tuples_histogram = histogram.tuples_histogram(input_list)
    # pick a position among all word occurrences, then walk the
    # cumulative counts until we pass it
    index = random.randrange(len(input_list))
    value = 0
    hist_ind = 0
    while value <= index and hist_ind < len(tuples_histogram):
        value += tuples_histogram[hist_ind][1]
        hist_ind += 1
    return tuples_histogram[hist_ind - 1][0]


def load_sentqs_tweets():
    """Load the cached, preprocessed SentQS tweets, building the cache on first use."""
    if os.path.isfile("data/sentqs_preprocessed.npz"):
        loaded_data = np.load("data/sentqs_preprocessed.npz")
        return (loaded_data["cleaned_tweets"], loaded_data["tweets"],
                loaded_data["y"], loaded_data["sentiment"],
                loaded_data["source_idx"], loaded_data["target_idx"])
    else:
        hashtags = [
            'ADBE', 'GOOGL', 'AMZN', 'AAPL', 'ADSK', 'BKNG', 'EXPE', 'INTC',
            'MSFT', 'NFLX', 'NVDA', 'PYPL', 'SBUX', 'TSLA', 'XEL',
            'positive', 'bad', 'sad'
        ]

        # Loading and preprocessing of tweets
        df = pd.read_csv("Tweets.csv")
        sentiment = pd.to_numeric(df.iloc[:, -1], errors="raise", downcast="float")
        labels, tweets, sentiment = seperate_tweets(df.iloc[:, 1], hashtags, sentiment)
        cleaned_tweets = cleanup.clean_text(tweets)

        y = preprocessing.LabelEncoder().fit_transform(labels)

        source_idx, target_idx = create_domain_adaptation_index(tweets, labels, sentiment)
        np.savez_compressed("data/sentqs_preprocessed.npz",
                            tweets=tweets, cleaned_tweets=cleaned_tweets, y=y,
                            sentiment=sentiment, source_idx=source_idx,
                            target_idx=target_idx)
        return cleaned_tweets, tweets, y, sentiment, source_idx, target_idx


def main_preprocessing():
    """Preprocess the tweets and produce statistics, embeddings, and plots."""
    hashtags = [
        'ADBE', 'GOOGL', 'AMZN', 'AAPL', 'ADSK', 'BKNG', 'EXPE', 'INTC',
        'MSFT', 'NFLX', 'NVDA', 'PYPL', 'SBUX', 'TSLA', 'XEL'
    ]

    # Loading and preprocessing of tweets
    df = pd.read_csv("Tweets.csv")
    labels, tweets = seperate_tweets(df.iloc[:, 1], hashtags)
    cleaned_tweets = cleanup.clean_text(tweets)
    y = preprocessing.LabelEncoder().fit_transform(labels)
    # Get some statistics of the dataset
    describe_dataset(cleaned_tweets, labels)

    # Create feature representations: TF-IDF variants and a skipgram
    # embedding with 1000 dimensions and negative sampling
    create_representation(cleaned_tweets, y)

    # Plot eigenspectrum of embeddings
    X = np.load("data/nsdqs_skipgram_embedding.npy")
    plot_eigenspectrum(X)

    # Plot a two-dimensional t-SNE embedding of the representation
    plot_tsne(X, labels)
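
The cumulative-count walk in random_word_tuple above is a standard weighted-sampling pattern: pick a position among all word occurrences, then accumulate counts until the running total passes it. A minimal self-contained sketch, with a hard-coded histogram standing in for the project's cleanup and histogram modules:

import random

def weighted_sample(tuples_histogram, total_words):
    index = random.randrange(total_words)   # a position among all occurrences
    value = 0
    for word, count in tuples_histogram:
        value += count
        if value > index:                   # cumulative count passed the position
            return word

hist = [('one', 1), ('fish', 4), ('two', 1)]
print(weighted_sample(hist, 6))             # prints 'fish' about 4/6 of the time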
Example #4
from cleanup import clean_text


def histogram(iterable):
    '''Maps each item to [item, occurrence count]. (Opening lines are a
    reconstruction; the original snippet begins mid-function.)'''
    dictionary = {}
    unique_list = []
    for item in iterable:
        if item not in unique_list:
            unique_list.append(item)

    for item in unique_list:
        occurrence = iterable.count(item)
        dictionary[item] = [str(item), occurrence]

    return dictionary


def frequency(histogram_in, item):
    '''
    Takes an item and a histogram and returns the number of
    times that item appears in the text.
    '''
    if item in histogram_in:
        return histogram_in[item][1]
    else:
        return 0


def all_items(histogram_in):
    '''Returns the total number of item occurrences in the text.'''
    total = 0
    for item in histogram_in:
        total += histogram_in[item][1]
    return total


if __name__ == "__main__":
    cleaned_text = clean_text('the_book.txt')
    print(histogram(cleaned_text))
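
Assuming the {item: [item, count]} shape built above, a quick check of the helpers:

hist = {'one': ['one', 1], 'fish': ['fish', 4]}
print(frequency(hist, 'fish'))  # 4
print(frequency(hist, 'red'))   # 0 (not in the histogram)
print(all_items(hist))          # 5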
Example #5
from flask import Flask, render_template, redirect, request, url_for
from cleanup import read_in_data, clean_text, add_stop, create_sentence
from random import randint, choice
from markov_chain import higher_markov, random_walk

app = Flask(__name__)

source_text = 'static/tumblr_wordz.txt'

words = read_in_data(source_text)
cleaned_text = clean_text(words)
stripped_words = add_stop(cleaned_text)
markov = higher_markov(stripped_words)


@app.route('/')
def index():

    # Use the requested word count if provided; otherwise default to 6 words
    usr_input = request.args.get('num')
    num_of_words = int(usr_input) if usr_input else 6

    # Create sentence with specified num_of_words
    output = random_walk(markov, num_of_words)
    sentence = create_sentence(output)

    return render_template('index.html', sentence=sentence)


if __name__ == '__main__':
    # the original snippet ends at the if-statement; the standard Flask
    # entry point below is an assumed completion
    app.run(debug=True)
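
A quick way to exercise the route without starting a server is Flask's built-in test client (this assumes the project's data file and index.html template are present):

with app.test_client() as client:
    response = client.get('/?num=10')   # ask for a 10-word sentence
    print(response.status_code)         # 200 when the template renders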
Example #6
# clean_text and random_walk_first come from the project's companion modules
def main():
    corpus = clean_text('book_1.txt')
    print(random_walk_first(corpus))
Example #7
import random
from cleanup import clean_text
from dictogram import Dictogram    # assumed module name
from listogram import Listogram    # assumed module name


def dictionary_sample(histogram_in):
    # opening reconstructed: the original snippet begins mid-function
    value = random.randrange(sum(histogram_in.values()))
    cume = 0
    for word in histogram_in:
        cume += histogram_in[word]
        if cume > value:
            return word


def sample_list_O_stuff(histogram_in):  # stuff means tuples or lists
    cap = 0
    i = 0
    while i < len(histogram_in):
        cap += histogram_in[i][1]
        i += 1

    # randrange avoids an off-by-one: with randint(0, cap), value == cap would
    # never satisfy cume > value and the function would fall through to None
    value = random.randrange(cap)

    cume = 0
    index = 0
    while index < len(histogram_in):
        cume += histogram_in[index][1]
        if cume > value:
            return histogram_in[index][0]
        index += 1


if __name__ == "__main__":
    text = clean_text('book_1.txt')
    hist = Dictogram(text)
    list_hist = Listogram(text)
    print(sample_list_O_stuff(list_hist))
    print(dictionary_sample(hist))
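
For comparison, the standard library's random.choices performs the same weighted draw in one call (a sketch, not part of the original project):

import random

list_hist = [('one', 1), ('fish', 4), ('two', 1)]
words = [pair[0] for pair in list_hist]
weights = [pair[1] for pair in list_hist]
print(random.choices(words, weights=weights)[0])   # 'fish' most often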
Example #8
    # __init__ of a class whose surrounding definition the snippet omits
    def __init__(self):
        self.text = cleanup.clean_text()
        self.token = words_list.token(self.text)
        self.word_dict = {}
Example #9
import cleanup


def token(text):
    """ Input: string that is clean text
        Process: converts string to list of words
        Output: List of words
    """
    word_list = text.split()
    return word_list


if __name__ == "__main__":
    text = cleanup.clean_text()
    word_list = token(text)
    # print(word_list)
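
Usage on an already-cleaned string (a quick check, not part of the original file):

print(token('one fish two fish'))   # ['one', 'fish', 'two', 'fish']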