def test_solve():
    """Check that Solver.solve('JREZQXW') returns the expected word list."""
    expected = [
        'ER', 'EX', 'JEW', 'RE', 'REW',
        'REX', 'REZ', 'WE', 'WEX', 'ZEX',
    ]

    solver = Solver()
    # The word list is loaded here but is not passed to the solver or used
    # by the assertion below.
    wordlist = WordList().words
    dealt = 'JREZQXW'

    assert solver.solve(dealt) == expected
def test_init(self):
    """A new GameStatus keeps the objects it was given and its start values."""
    char_mat = CharMat()
    word_list = WordList()
    game_status = GameStatus(char_mat, word_list)

    # (attribute name, value expected right after construction), checked in
    # this order.
    expected_attributes = (
        ("word_list", word_list),
        ("char_mat", char_mat),
        ("score", 0),
        ("time", 60),
        ("times_up", False),
        ("success", False),
    )
    for attribute, expected in expected_attributes:
        self.assertEqual(getattr(game_status, attribute), expected)
def __init__(self, grid, word_list_filename, print_func=print):
    """Build a WordSearch from a grid of letters and a word-list text file.

    Args:
        grid (list) - letters grid, given as a list of n strings of
            m letters each
        word_list_filename (str) - full path of the input word-list
            text file.
        print_func (func(x)) - [optional] function used to print a word;
            defaults to the built-in print
    """
    self.set_grid(grid)
    self._word_list = WordList(word_list_filename)
    self._print_func = print_func
    # Counter starts at zero (presumably the number of words found so far).
    self._no_found_words = 0
def create_word_list(file_name):
    """Creates a WordList object from an input word-list text file.

    Args:
        file_name (str) - full path of the input word-list UTF-8 text file,
            one word per line.

    Returns:
        The created WordList object
    """
    print("creating word-list from " + str(file_name) + " file... ")
    # Load the file the caller asked for; previously the hard-coded
    # "word.list" was always loaded and file_name was only printed.
    word_list = WordList(file_name)
    print("[DONE]")
    return word_list
# preprocess (Reddit variant): run the whole data-preparation pipeline and
# return the pair (data.data_model, word2vec_data_model).
#
# NOTE(review): the formatting of this definition has been collapsed onto two
# physical lines, so block nesting cannot be read from indentation. The code
# is left untouched; the notes below follow statement order only.
#
# Parameters:
#   data_path, is_testing  - forwarded to DataInitializer.initialize();
#                            is_testing also selects the test/train CSV name
#                            and is passed to DataCleaning, DataCleaner and
#                            BagOfWords.
#   min_occurrences        - forwarded to WordList.build_wordlist().
#   cache_bow_output       - when not None, data.data_model is written to it
#                            as CSV.
#   cache_word2vec_output  - target of word2vec_data_model.to_csv(); it has no
#                            None check of its own (whether it sits under the
#                            cache_bow_output guard cannot be told from the
#                            collapsed layout - TODO confirm).
#   duration               - when truthy, forwarded to initialize().
#
# Steps, in statement order:
#   1. DataInitializer().initialize(...).
#   2. GetPricesData().main() is run when "data/BTC.csv" exists.
#      NOTE(review): the condition has no `not`; confirm whether prices were
#      meant to be fetched only when the file is missing.
#   3. DataCleaning(...).cleanup(DataCleaner(...)); prints processed_data.shape.
#   4. Sentiments(...).sentiment_analysis_by_text(); processed_data is written
#      to data/clean_test_with_sentiments.csv or
#      data/clean_train_with_sentiments.csv.
#   5. DataTokenize tokenize() + stem(), WordList.build_wordlist(),
#      BagOfWords.build_data_model().
#   6. Word2VecProvider loads a hard-coded GloVe path; RedditData builds the
#      final model; the "index" column is dropped if present, NaN rows are
#      dropped, the index is reset and then set from the 'timestamp_ms' column.
#   7. Tokenizer(num_words=20000) is fitted on the "summary" texts and the
#      sequences are padded with pad_sequences(maxlen=200) (presumably the
#      Keras preprocessing helpers - verify against the file's imports).
#   8. Plotting(data).plot() when not is_testing; `data` is rebound to the
#      Plotting object, so the later data.data_model access relies on Plotting
#      exposing that attribute - TODO confirm.
#   9. padded_sequences is pickled to 'sequences' and sentiments to
#      'sentiments' (relative to the current working directory).
#
# tokenized_data is assigned but never read.
def preprocess(data_path, is_testing, min_occurrences=5, cache_bow_output=None, cache_word2vec_output=None, duration=None): if duration: data = DataInitializer() data.initialize(data_path, is_testing, duration=duration) else: data = DataInitializer() data.initialize(data_path, is_testing) if os.path.isfile("data/BTC.csv"): prices_data = GetPricesData() prices_data.main() data = DataCleaning(data, is_testing) data.cleanup(DataCleaner(is_testing)) if is_testing: print("Testing data shape:", data.processed_data.shape) else: print("Training data shape:", data.processed_data.shape) data = Sentiments(data) data.sentiment_analysis_by_text() print("First five rows with sentiment: ", data.processed_data.head()) if is_testing: data.processed_data.to_csv("data/clean_test_with_sentiments.csv", sep=',', encoding='utf-8', index=False) # os.remove(data_path) else: data.processed_data.to_csv("data/clean_train_with_sentiments.csv", sep=',', encoding='utf-8', index=False) # os.remove(data_path) data = DataTokenize(data) data.tokenize() data.stem() data = WordList(data) data.build_wordlist(min_occurrences=min_occurrences) word2vec_data = data data = BagOfWords(data.processed_data, data.wordlist, is_testing) data.build_data_model() print("data model head: ", data.data_model.head(5)) """ Word 2 vec """ word2vec = Word2VecProvider() # REPLACE PATH TO THE FILE word2vec.load("../twitter/data/glove.twitter.27B.200d.txt") word2vec_data = RedditData(word2vec_data) word2vec_data.build_final_model(word2vec) word2vec_data_model = word2vec_data.data_model if "index" in word2vec_data_model.columns: word2vec_data_model.drop("index", axis=1, inplace=True) word2vec_data_model.dropna(axis=0, inplace=True) word2vec_data_model.reset_index(inplace=True) word2vec_data_model.index = word2vec_data_model['timestamp_ms'] print("final word2vec data model: \n", word2vec_data_model.head(), "\n") """ Tokenizing the data """ texts = [] sentiments = [] tokenized_data = pd.DataFrame() for text in 
data.processed_data["summary"]: texts.append(text) for sentiment in data.processed_data["sentiment"]: sentiments.append(sentiment) print("texts: ", texts[0:5]) tokenizer = Tokenizer(num_words=20000) tokenizer.fit_on_texts(texts) sequences = tokenizer.texts_to_sequences(texts) padded_sequences = pad_sequences(sequences, maxlen=200) print( "\n\n##################################################\npadded sequence head: \n", padded_sequences[0:5]) print( "\n####################################################\n padded sequence length \n", len(padded_sequences)) if not is_testing: data = Plotting(data) data.plot() if cache_bow_output is not None: data.data_model.to_csv(cache_bow_output, index=False, float_format="%.6f") word2vec_data_model.to_csv(cache_word2vec_output, index=False, float_format="%.6f") with open('sequences', 'wb') as fp: pickle.dump(padded_sequences, fp) with open('sentiments', 'wb') as fp: pickle.dump(sentiments, fp) return data.data_model, word2vec_data_model
# Remaining command-line options; `ap` is the argument parser created earlier
# in the script.
ap.add_argument('-depth', required=False, type=int,
                help="How deep should we descend into the structure.")
ap.add_argument('-bruteforce', dest='bruteforce', action='store_true',
                help="Do you want to use bruteforce?")
ap.set_defaults(threads=20, depth=2, bruteforce=False)
args = vars(ap.parse_args())
url = args['url']

# Candidate words come either from the brute-force generator or from a
# word-list file; one of the two has to be selected on the command line.
if args['bruteforce']:
    word_generator = BruteForceGenerator().generator()
elif args['words']:
    word_generator = WordList(args['words']).word_list
else:
    print(
        "Please set the bruteforce flag or specify a path to a text file containing words."
    )
    # This is a usage error, so exit with a non-zero status. The `exit()`
    # helper is meant for the interactive shell and `exit(0)` reported
    # success to the calling process.
    raise SystemExit(1)

dircrawl_manager = DircrawlManager(word_generator, url, args['threads'],
                                   args['depth'])
dircrawl_manager.run()

# One table row per request made by the crawl: its URL and HTTP status code.
list_for_tabulate = [[request.url, request.http_code]
                     for request in dircrawl_manager.results]
print(
    tabulate(list_for_tabulate,
             headers=['URL', 'HTTP CODE'],
             tablefmt='grid'))
#!/usr/bin/env python
"""Pair letters of a fixed word with letters of a rack and report which
two-letter combinations WordList accepts as words."""
from word_list import WordList

__author__ = 'kreitzem'

# rack = sys.argv[1].upper()
wl = WordList()
word = "WASTE"
rack = "THAT".upper()
word_num = 0  # index of the current letter in `word`
rack_num = 0  # index of the current letter in `rack`

for num in range(0, len(word) - 1, 1):
    temp_word = word[word_num] + rack[rack_num]
    if wl.is_a_word(temp_word):
        # print() with a single argument works on both Python 2 and 3; the
        # original print statement was Python 2 only.
        print("we got %s" % temp_word)
        # Advance both cursors. The original `x = + 1` assigned the constant
        # 1 instead of incrementing, so the indices never moved past 1.
        word_num += 1
        rack_num += 1
    else:
        print("not a word %s" % temp_word)
        # Only move on to the next letter of the word; keep the rack letter.
        word_num += 1
    # The bare `next` expressions that ended each branch were no-ops (they
    # only named the builtin) and have been dropped; the loop advances on
    # its own.
def __init__(self, anagram_dictionary=None):
    """Store the dictionary used for anagram look-ups.

    Args:
        anagram_dictionary: dictionary object to use. When omitted (None),
            ``WordList().all`` is loaded at construction time.
    """
    # The default used to be `WordList().all` in the signature. That is
    # evaluated once when the function is defined, so the word list was
    # loaded at import time and one object was shared by every instance
    # created without an argument.
    if anagram_dictionary is None:
        anagram_dictionary = WordList().all
    self.anagram_dictionary = anagram_dictionary
def setUpClass(cls):
    """Attach a WordList built from the bundled en.txt file to the class."""
    word_list_path = 'boggle_app/word_lists/en.txt'
    cls.word_list = WordList(word_list_path)
# preprocess (Twitter variant): build, or reload from CSV caches, the
# bag-of-words and word2vec data models, and pickle tokenised, padded tweet
# sequences merged with price columns. Returns the pair
# (data.data_model, word2vec_data_model).
#
# NOTE(review): the formatting of this definition has been collapsed onto a
# few physical lines, so block nesting cannot be read from indentation. The
# code is left untouched; the notes below follow statement order only.
#
# Parameters:
#   data_path, is_testing  - forwarded to DataInitializer.initialize();
#                            is_testing also selects the test/train file names
#                            and which tokenising branch runs.
#   min_occurrences        - forwarded to WordList.build_wordlist().
#   cache_bow_output,
#   cache_word2vec_output  - CSV paths. If cache_word2vec_output exists on
#                            disk, both models are read back with pd.read_csv
#                            instead of being rebuilt; the to_csv calls near
#                            the end write to the same paths.
#   duration               - forwarded to initialize() only when both cache
#                            paths are truthy as well (in that branch the
#                            cache paths themselves are not forwarded).
#   sentiment_method       - forwarded to Sentiments(...).
#
# Files written: data/one_month_clean_test_data_with_prices.csv or
# data/one_month_clean_data_with_prices.csv; data/train_sequences,
# data/train_prices, data/embeddings_index and data/train_word_indexes when
# training; data/test_sequences, data/test_prices and data/test_word_indexes
# when testing. The embeddings index is a dict mapping each first token of a
# GloVe line to a float32 numpy array of the remaining values.
#
# Review notes (confirm before relying on them):
#   * os.path.isfile(cache_word2vec_output) is reached with the default
#     None, which os.path.isfile does not accept on Python 3 (TypeError),
#     unless an enclosing guard that the collapsed layout hides prevents it.
#   * GetPricesData().main() is run once for each of data/Train_BTC.csv and
#     data/Test_BTC.csv that is missing, so it may run twice.
#   * The existence checks look for "train_sequences" / "test_sequences" in
#     the current directory, while the pickles are written under data/.
#   * padded_sequences and tokenizer are only assigned inside the tokenising
#     branches but are used by the print calls that follow them.
#   * train_data_word2vec_file_name and test_data_word2vec_file_name are
#     neither parameters nor assigned in this function; they must come from
#     module scope.
#   * In the cache-hit path `data` is still the Sentiments object and
#     data_model is attached to it as a new attribute.
#   * tokenized_data and sentiments are built but never read afterwards.
def preprocess(data_path, is_testing, min_occurrences=5, cache_bow_output=None, cache_word2vec_output=None, duration=None, sentiment_method=None): if duration and cache_bow_output and cache_word2vec_output: data = DataInitializer() data.initialize(data_path, is_testing, duration=duration) elif cache_bow_output and cache_word2vec_output: data = DataInitializer() data.initialize(data_path, is_testing, cache_bow_output=cache_bow_output, cache_word2vec_output=cache_word2vec_output) else: data = DataInitializer() data.initialize(data_path, is_testing) if not os.path.isfile("data/Train_BTC.csv"): prices_data = GetPricesData() prices_data.main() if not os.path.isfile("data/Test_BTC.csv"): prices_data = GetPricesData() prices_data.main() data = DataCleaning(data, is_testing) data.cleanup(DataCleaner(is_testing)) if is_testing: print("Testing data shape:", data.processed_data.shape) else: print("Training data shape:", data.processed_data.shape) data = Sentiments(data, sentiment_method=sentiment_method) data.sentiment_analysis_by_text() print("First five rows with sentiment: ", data.processed_data.head()) if is_testing: data.processed_data.to_csv( "data/one_month_clean_test_data_with_prices.csv", sep=',', encoding='utf-8', index=False) # os.remove(data_path) else: data.processed_data.to_csv("data/one_month_clean_data_with_prices.csv", sep=',', encoding='utf-8', index=False) # os.remove(data_path) if os.path.isfile(cache_word2vec_output): print("cache_word2vec_output file name: ", cache_word2vec_output) word2vec_data_model = pd.read_csv(cache_word2vec_output) data.data_model = pd.read_csv(cache_bow_output) print("data model head: ", data.data_model.head(5)) else: data = DataTokenize(data) data.tokenize() data.stem() data = WordList(data) data.build_wordlist(min_occurrences=min_occurrences) word2vec_data = data data = BagOfWords(data.processed_data, data.wordlist, is_testing) data.build_data_model() print("data model head: ", data.data_model.head(5)) """ Word 2 vec """ 
word2vec = Word2VecProvider() # REPLACE PATH TO THE FILE word2vec.load("data/glove.twitter.27B.200d-with2num.txt") word2vec_data = TwitterData(word2vec_data) word2vec_data.build_final_model(word2vec) word2vec_data_model = word2vec_data.data_model if "original_id" in word2vec_data_model.columns: word2vec_data_model.drop("original_id", axis=1, inplace=True) word2vec_data_model.dropna(axis=0, inplace=True) word2vec_data_model.reset_index(inplace=True, drop=True) word2vec_data_model.index = word2vec_data_model['timestamp'] print("final word2vec data model: \n", word2vec_data_model.head(), "\n") # if not is_testing: # data = Plotting(data) # data.plot() if not is_testing: if not os.path.isfile("train_sequences"): print("\n##########################\n" "Tokenizing the tweets\n" "############################\n") texts = [] sentiments = [] tokenized_data = pd.DataFrame() for text in data.processed_data["text"]: texts.append(text) for sentiment in data.processed_data['sentiment']: sentiments.append(sentiment) print("texts: ", texts[0:5]) tokenizer = Tokenizer() tokenizer.fit_on_texts(texts) sequences = tokenizer.texts_to_sequences(texts) padded_sequences = pad_sequences(sequences, maxlen=20, padding='post') padded_sequences = pd.DataFrame(data=padded_sequences) merged_train_data = pd.concat([ padded_sequences, data.processed_data[[ "high", "low", "open", "quoteVolume", "volume", "weightedAverage" ]] ], axis=1) train_targets = data.processed_data[["close"]] print("shape of merged train data: ", merged_train_data.shape) with open('data/train_sequences', 'wb') as fp: pickle.dump(merged_train_data, fp) with open('data/train_prices', 'wb') as fp: pickle.dump(train_targets, fp) # load the whole embedding into memory embeddings_index = dict() with open("data/glove.twitter.27B.200d-with2num.txt", "r", encoding="utf-8") as my_file: for line in my_file: values = line.split() word = values[0] coefs = numpy.asarray(values[1:], dtype='float32') embeddings_index[word] = coefs # f.close() 
print("*" * 80, "\n" * 10) print('Loaded %s train word vectors.' % len(embeddings_index)) print('Total %s of word indexes.' % len(tokenizer.word_index)) with open('data/embeddings_index', 'wb') as fp: pickle.dump(embeddings_index, fp) with open('data/train_word_indexes', 'wb') as fp: pickle.dump(tokenizer.word_index, fp) # encode class values as integers # encoder = LabelEncoder() # encoder.fit(sentiments) # encoded_sentiments = encoder.transform(sentiments) # convert integers to dummy variables (i.e. one hot encoded) # dummy_sentiments = np_utils.to_categorical(encoded_sentiments) # for text in data.processed_data.loc[data.processed_data['sentiment'] != 0, "text"]: # texts.append(text) # # for sentiment in data.processed_data.loc[data.processed_data['sentiment'] != 0, "sentiment"]: # sentiments.append(sentiment) else: if not os.path.isfile("test_sequences"): print("\n##########################\n" "Tokenizing the tweets\n" "############################\n") texts = [] sentiments = [] tokenized_data = pd.DataFrame() for text in data.processed_data["text"]: texts.append(text) for sentiment in data.processed_data['sentiment']: sentiments.append(sentiment) print("texts: ", texts[0:5]) tokenizer = Tokenizer() tokenizer.fit_on_texts(texts) sequences = tokenizer.texts_to_sequences(texts) padded_sequences = pad_sequences(sequences, maxlen=20, padding='post') padded_sequences = pd.DataFrame(data=padded_sequences) merged_test_data = pd.concat([ padded_sequences, data.processed_data[[ "high", "low", "open", "quoteVolume", "volume", "weightedAverage" ]] ], axis=1) test_targets = data.processed_data[["close"]] print("shape of merged test data: ", merged_test_data.shape) with open('data/test_sequences', 'wb') as fp: pickle.dump(merged_test_data, fp) with open('data/test_prices', 'wb') as fp: pickle.dump(test_targets, fp) with open('data/test_word_indexes', 'wb') as fp: pickle.dump(tokenizer.word_index, fp) # padded_sequences = pd.DataFrame(data=padded_sequences) print( 
"\n\n##################################################\npadded sequence head: \n", padded_sequences[0:5]) print( "\n####################################################\n padded sequence length \n", len(padded_sequences)) if not os.path.isfile(train_data_word2vec_file_name) or not os.path.isfile( test_data_word2vec_file_name): if cache_bow_output is not None: data.data_model.to_csv(cache_bow_output, index=False, float_format="%.6f") word2vec_data_model.to_csv(cache_word2vec_output, index=False, float_format="%.6f") return data.data_model, word2vec_data_model
def test_check_word():
    """Build the game objects needed for a word check.

    NOTE(review): the visible body only constructs the objects; it contains
    no assertion.
    """
    # create all game like variable
    board = CharMat()
    board.set_word("hello", (4, 4), (0, 0))
    words = WordList()
    game_status = GameStatus(board, words)
def stat_maker():
    """Return a StatMaker built from the words of a fresh WordList."""
    return StatMaker(WordList().words)