def list_hist(source):
    '''List-of-lists histogram, e.g. [['hello', 1], ['you', 3], ['sir', 4]].

    Takes text, stores each item in the text, compares each item to the rest
    of the words in the text, and keeps a running total. The used list
    accounts for words already counted, so there are no repeats.
    '''
    histo = []
    used = []
    text = clean(source)
    # print(text)
    for word in text:
        counter = 0
        if word in used:
            continue
        used.append(word)
        for word2 in text:
            if word == word2:
                counter += 1
        instance = [word, counter]
        histo.append(instance)
    # print(histo)
    return histo

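# Usage sketch for list_hist (illustrative only): the exact output depends on
# how clean() normalizes the source; here it is assumed to lowercase and split
# the text into a word list.
sample = "one fish two fish red fish blue fish"
print(list_hist(sample))
# -> [['one', 1], ['fish', 4], ['two', 1], ['red', 1], ['blue', 1]]
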
def tuple_hist(source):
    '''Fastest variant - tuples are immutable. List of tuples:
    [('hello', 3), ('what', 4)]

    Takes text, stores each item in the text, compares each item to the rest
    of the words in the text, and keeps a running total. The used list
    accounts for words already counted, so there are no repeats.
    '''
    histo = []
    used = []
    text = clean(source)
    # print(text)
    for word in text:
        # see if we've used the word before
        counter = 0
        if word in used:
            continue
        used.append(word)
        for word2 in text:
            if word == word2:
                counter += 1
        instance = (word, counter)
        histo.append(instance)
    # print(histo)
    return histo

def hello_world():
    with open("./words.txt", "r") as my_file:
        lines = clean(my_file)
    my_histogram = list_hist(lines)
    word = prob_sample(my_histogram)
    return word

def preprocess(df, mode):
    # Clean the text
    df['question1'] = df.question1.map(lambda x: ct.clean(x))
    df['question2'] = df.question2.map(lambda x: ct.clean(x))

    # Prepare the data for the model
    print("Preparing data for model...")

    # While training, create a Tokenizer object and also return labels
    if mode == 'train':
        tokenizer = tokenize(df)
        df['question1'] = tokenizer.texts_to_sequences(df.question1)
        df['question2'] = tokenizer.texts_to_sequences(df.question2)
        question1 = np.array(
            list(
                tf.keras.preprocessing.sequence.pad_sequences(
                    df.question1, maxlen=maxlen)))
        question2 = np.array(
            list(
                tf.keras.preprocessing.sequence.pad_sequences(
                    df.question2, maxlen=maxlen)))
        labels = np.array(list(df.is_duplicate))
        return question1, question2, labels, tokenizer

    # While predicting, load the existing Tokenizer object
    if mode == 'predict':
        with open('../checkpoints/tokenizer.pickle', 'rb') as handle:
            tokenizer = pickle.load(handle)
        df['question1'] = tokenizer.texts_to_sequences(df.question1)
        df['question2'] = tokenizer.texts_to_sequences(df.question2)
        question1 = np.array(
            list(
                tf.keras.preprocessing.sequence.pad_sequences(
                    df.question1, maxlen=maxlen)))
        question2 = np.array(
            list(
                tf.keras.preprocessing.sequence.pad_sequences(
                    df.question2, maxlen=maxlen)))
        return question1, question2

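# Usage sketch for preprocess in 'train' mode (illustrative only). It assumes
# the module-level names used above (ct, np, tf, pickle, maxlen, tokenize) are
# defined; the DataFrame contents are invented stand-ins for the real
# question-pair data.
import pandas as pd

toy_df = pd.DataFrame({
    'question1': ['How do I learn Python?', 'What is machine learning?'],
    'question2': ['What is the best way to learn Python?', 'Who invented chess?'],
    'is_duplicate': [1, 0],
})
q1, q2, labels, tokenizer = preprocess(toy_df, mode='train')
print(q1.shape, q2.shape, labels.shape)  # each question array is padded to (2, maxlen)
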
def get_all_tweets(screen_name):
    # authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)

    # initialize a list to hold all the tweepy Tweets
    all_tweets = []

    # make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name,
                                   count=200,
                                   tweet_mode='extended')

    # save most recent tweets
    all_tweets.extend(new_tweets)

    # save the id of the oldest tweet less one
    oldest = all_tweets[-1].id - 1

    # keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        print("getting tweets before %s" % (oldest))

        # all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name,
                                       count=200,
                                       max_id=oldest)

        # save most recent tweets
        all_tweets.extend(new_tweets)

        # update the id of the oldest tweet less one
        oldest = all_tweets[-1].id - 1

        print("...%s tweets downloaded so far" % (len(all_tweets)))

    bad_words = get_bad_phrases(bucket, key)
    final_list = ""
    for tweet in all_tweets:
        if hasattr(tweet, "full_text"):
            raw = tweet.full_text
        else:
            raw = tweet.text
        # drop the whole tweet if it contains a banned phrase
        for phrase in bad_words:
            if phrase in raw:
                raw = ""
        cleaned = clean_text.clean(raw)
        if cleaned != "":
            final_list += (cleaned + "\n")

    # append the cleaned tweets to a per-user scratch file
    with open(f"/tmp/{screen_name}-clean.txt", "a") as clean_file:
        clean_file.write(final_list)

    return final_list

def list_hist(source):
    text = clean(source)
    histogram = []
    for word in text:
        instance = [word, 0]
        for word2 in text:
            if word == word2:
                instance[1] += 1
        if instance not in histogram:
            histogram.append(instance)
    # print(histogram)
    return histogram

def counts_list(source):
    histo = []
    instances = []
    used = []
    text = clean(source)
    # print(text)
    for word in text:
        # check if the word has already been accounted for
        if word in used:
            continue
        counter = 0
        used.append(word)
        # for each word in the text, if it matches a word in the same text,
        # we have an instance of that word - so increase counter by 1
        for word2 in text:
            if word == word2:
                counter += 1
        # we know the word and we have the occurrences stored in counter.
        # create a list instance object with the word and its occurrences
        # and append it to the list of word instances.
        instance = [word, counter]
        instances.append(instance)

    used_nums = []
    for item in instances:
        # check if the word frequency has been accounted for before
        if item[1] in used_nums:
            continue
        used_nums.append(item[1])
        membs = []
        # this is what an instance of our histogram looks like
        new_instance = (item[1], membs)
        # for one item in our instances we check if the frequency matches
        # any other frequencies in the instances list. if it does, we add
        # those words to the members list.
        for item2 in instances:
            if item[1] == item2[1]:
                # print(item2[0])
                membs.append(item2[0])
        histo.append(new_instance)
    # print(histo)
    return histo

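# Usage sketch for counts_list (illustrative only), assuming clean() simply
# splits the source into words: the result groups words by how often they occur.
print(counts_list("a b b c c c"))
# -> [(1, ['a']), (2, ['b']), (3, ['c'])]
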
def dict_hist(source):
    '''Opens a text file, creates an empty dictionary, adds a zero-count
    entry for each unseen word, and adds 1 for every occurrence.
    '''
    histogram = {}
    text = clean(source)
    # print(text)
    for word in text:
        if word not in histogram:
            histogram[word] = 0
        histogram[word] += 1
    # print(histogram)
    return histogram

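# A minimal alternative sketch: the standard library's collections.Counter
# builds the same word -> count mapping in one call (still assuming clean()
# returns a list of words).
from collections import Counter

def dict_hist_counter(source):
    return dict(Counter(clean(source)))
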
def stochastic_sample(source):
    '''Histogram -> percentages -> random item.'''
    hist = invert_hist(source)
    percentages = {}
    # print(hist)
    text = clean(source)
    length = len(text)
    dart = random.randint(0, 100)
    # print("dart: {}".format(dart))

    # calculate the total word count so each frequency can become a percentage
    total = 0
    for item in hist:
        # len(item[1]) is the number of words sharing the frequency item[0]
        total += (len(item[1]) * item[0])

    # individual percentages added to the percentages dict
    for item in hist:
        for word in item[1]:
            word_percentage = item[0] / total
            percentages[word] = word_percentage * 100

    # find where the dart hit: each word has a percentage assigned, and we
    # keep adding them until the running total crosses the dart
    num = 0
    target = None
    for word in percentages:
        num += percentages[word]
        target = word
        if num > dart:
            # we add until we cross the line; once we cross, we return
            break
    return str(target)

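# Rough sanity check for stochastic_sample (illustrative only): sample many
# times and compare the empirical counts against the word frequencies.
# `text_source` is a hypothetical stand-in for whatever source is passed above.
from collections import Counter

text_source = "one fish two fish red fish blue fish"
draws = Counter(stochastic_sample(text_source) for _ in range(1000))
print(draws.most_common())  # 'fish' should dominate, roughly half the draws
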
def dict_hist(source):
    '''Dictionary of key-value pairs, e.g. {'hello': 1, 'sir': 2, 'how': 5}.

    Takes text, stores each item in the text, and keeps a running total of
    how many times each word appears.
    '''
    histo_dict = {}
    text = clean(source)
    # print(text)
    for word in text:
        if word in histo_dict:
            histo_dict[word] += 1
        else:
            histo_dict[word] = 1
    # print(histo_dict)
    return histo_dict

def tuple_hist(source):
    text = clean(source)
    histogram = []
    # text = separate(text)
    cache = []
    for word in text:
        if word not in cache:
            cache.append(word)
            num_occur = 0
            for word2 in text:
                if word2 == word:
                    num_occur += 1
            instance = (word, num_occur)
            histogram.append(instance)
    return histogram

def markov(source):
    text = clean(source)
    chain = {}
    current = None
    prev = None
    for word in text:
        current = word
        if not chain:
            # first word: seed the chain with no successor yet
            chain[current] = None
        else:
            # map the previous word to the word that follows it;
            # each key keeps only the most recent successor seen
            chain[prev] = current
        prev = current
    return chain

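# Usage sketch for markov (illustrative only), assuming clean() just lowercases
# and splits the text. Note each key stores only the most recent successor seen,
# so earlier transitions are overwritten.
print(markov("the cat sat on the mat"))
# -> {'the': 'mat', 'cat': 'sat', 'sat': 'on', 'on': 'the'}
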
x_text = np.array(twitter_df.tokens)
x_text = labelizeTweets(x_text, 'ACTUAL')
tweet_vecs = np.concatenate(
    [buildWordVector(z, n_dim) for z in map(lambda x: x.words, x_text)])
tweet_vecs = scale(tweet_vecs)
print("word vectors are created")

df = capture_sentiment(twitter_df, tweet_vecs)
df_pos = df[df['Sentiment'] == 1]
df_neg = df[df['Sentiment'] == 0]

documents = list(df_pos['SentimentText'])
doc_clean = [clean(doc).split() for doc in documents]
print("documents are cleaned")

# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)
dictionary.filter_extremes()

# Converting the list of documents (corpus) into a Document-Term Matrix using the dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

# Creating the object for the LDA model using the gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and training the LDA model on the document-term matrix
# (hyperparameter values below are illustrative).
print("training LDA started")
ldamodel = Lda(doc_term_matrix,
               num_topics=10,
               id2word=dictionary,
               passes=10)

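# Illustrative follow-on: once trained, the gensim model can report its topics.
# The num_topics/num_words values here are arbitrary examples.
for topic_id, terms in ldamodel.print_topics(num_topics=10, num_words=8):
    print(topic_id, terms)
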
def generate(tweets):
    user = random.randint(0, len(tweets) - 1)
    bad_phrases = get_bad_phrases(bucket, key)
    tweet_pool = []
    # one time in five, build from a single user's tweets; otherwise pool everyone
    if random.randint(0, 4) == 0:
        tweet_pool = tweets[user]
    else:
        for person in tweets:
            tweet_pool += person
    random.shuffle(tweet_pool)
    tweet_sample_size = 30
    bigram_dict, trigram_dict, firstWords = generate_ngrams.generate_ngrams(
        tweet_pool[0:tweet_sample_size])

    while True:
        written = ""
        written += firstWords[random.randint(0, len(firstWords) - 1)]
        while ("RETRYRETRY" not in written.upper()
               and "terminate" not in written.lower()):
            generated = generate_next_word(written, bigram_dict, trigram_dict)
            written += " " + generated

        if "RETRYRETRY" not in written.upper():
            valid = True
            final = re.sub(" terminate| Terminate| TERMINATE", "", written)
            final = final.rstrip()
            if len(final) > 2:
                final = final[0].capitalize() + final[1:]
            if final.count(" ") > 1 and len(final) <= 120:
                clean_sub = clean_text.clean(
                    final[int(.20 * len(final)):int(len(final) * .80)])
                for tweet in tweet_pool[0:tweet_sample_size]:
                    base_words = tweet.upper().split(" ")
                    base_words = list(dict.fromkeys(base_words))
                    base_words.sort()
                    generated_words = final.upper().split(" ")
                    generated_words = list(dict.fromkeys(generated_words))
                    generated_words.sort()
                    if base_words == generated_words:
                        valid = False
                    if clean_text.clean(clean_sub).upper() in clean_text.clean(
                            tweet).upper():
                        valid = False
                        # print("\nnot tweeting: ", clean_sub, "\nbecause it is in: ", tweet, "\n")
                    for phrase in bad_phrases:
                        if phrase.upper() in tweet.upper():
                            valid = False
            else:
                valid = False
            # strip quotation marks unless they appear as exactly one balanced pair
            if final.count("\"") != 2 and final.count("\"") != 0:
                final = re.sub("\"", "", final)
            if valid:
                return final

import random

import clean_text

with open('./opspam_reviews.txt', 'r') as f:
    revs = f.readlines()
with open('./opspam_labels.txt', 'r') as f:
    labs = f.readlines()

labs = [int(l.strip()) for l in labs]
revs = [clean_text.clean(r) for r in revs]

# split into deceptive (label 1) and non-deceptive (label 0) reviews
decep = [r for r, l in zip(revs, labs) if l == 1]
nondecep = [r for r, l in zip(revs, labs) if l == 0]
nondecep = list(set(list(nondecep)))

# shuffle both groups by sampling random index orders
decep_idx = random.sample(list(range(len(decep))), len(decep))
nondecep_idx = random.sample(list(range(len(nondecep))), len(nondecep))
decep = [decep[i] for i in decep_idx]
nondecep = [nondecep[i] for i in nondecep_idx]

train_decep = decep[0:640]
test_decep = decep[640:]
train_nondecep = nondecep[0:636]
test_nondecep = nondecep[636:]

train = train_decep + train_nondecep
train_labs = [1] * len(train_decep) + [0] * len(train_nondecep)
test = test_decep + test_nondecep
test_labs = [1] * len(test_decep) + [0] * len(test_nondecep)

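# Illustrative follow-on only: a simple bag-of-words baseline over the split
# built above, using scikit-learn (not part of the original script).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train)
X_test = vectorizer.transform(test)
classifier = LogisticRegression(max_iter=1000).fit(X_train, train_labs)
print("test accuracy:", classifier.score(X_test, test_labs))
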
def __init__(self, corpus):
    self.corpus = clean(corpus)
    self.states = {}
    self.chain()

name = tree.xpath('//div/strong/text()')
if len(name) != 1:
    continue
else:
    name = name[0]

rating = tree.xpath('//span[@class="gen-user-ratings"]/text()')
rating = [r for r in rating if r != ' ']
if len(rating) < 1:
    continue
else:
    vendor_rating = clean(rating[0].replace('~', '.').replace('/', '.').replace(
        ' deals', '').replace('.5, ', ' '))

test = tree.xpath('//div[@class="embedded-feedback-list"]/table/tr/td//text()')
test = [t for t in test if t.replace(' ', '') != '']
vals_re = re.compile('[0-5]/[0-5]')
days_ago_re = re.compile('[0-9]+ days ago')
if len(test) > 0:
    rating_inds = [ind for ind, t in enumerate(test) if vals_re.match(t)]
    days_ago_inds = [ind for ind, t in enumerate(test) if days_ago_re.match(t)]
else:
    continue
# The only reviews which are really reviews are those which
# are followed by a 'left X days ago' thing