Exemple #1
0
def test_solve():
    """Check the solver's answer for the rack 'JREZQXW'.

    The comparison is an exact list equality, so both the set of words and
    their order must match.
    """
    solver = Solver()
    # The word list is not used below; the call is kept in case constructing
    # it has side effects the solver depends on.
    _ = WordList().words

    rack = 'JREZQXW'
    anticipated = [
        'ER',
        'EX',
        'JEW',
        'RE',
        'REW',
        'REX',
        'REZ',
        'WE',
        'WEX',
        'ZEX',
    ]

    found = solver.solve(rack)
    assert found == anticipated
 def test_init(self):
     """A new GameStatus keeps its collaborators and initial counters.

     Expected right after construction: the same ``word_list`` and
     ``char_mat`` objects that were passed in, ``score`` 0, ``time`` 60,
     and both ``times_up`` and ``success`` equal to False.
     """
     matrix = CharMat()
     words = WordList()
     status = GameStatus(matrix, words)

     # (attribute name, expected value) pairs, checked in this order.
     expectations = (
         ("word_list", words),
         ("char_mat", matrix),
         ("score", 0),
         ("time", 60),
         ("times_up", False),
         ("success", False),
     )
     for attribute, wanted in expectations:
         self.assertEqual(getattr(status, attribute), wanted)
    def __init__(self, grid, word_list_filename, print_func=print):
        """Build a WordSearch from a grid of letters and a word-list file.

        Args:
            grid (list) - the letters grid, given as a list of n strings
                of m letters each; handed to ``set_grid``.
            word_list_filename (str) - full path of the input word-list
                text file; handed to ``WordList``.
            print_func (func(x)) - [optional] function used to print a
                word; defaults to the built-in ``print``.
        """
        self.set_grid(grid)

        loaded_words = WordList(word_list_filename)
        self._word_list = loaded_words

        # Counter starts at zero for a fresh instance.
        self._no_found_words = 0
        self._print_func = print_func
Exemple #4
0
def create_word_list(file_name):
    """Create a WordList object from an input word-list text file.

    Progress messages are printed before and after the list is built.

    Args:
        file_name (str) - full path of the input word-list UTF-8 text file,
            with one word per line.

    Returns:
        The created WordList object.
    """
    print("creating word-list from " + str(file_name) + " file... ")
    # Load the file the caller asked for. The previous code ignored
    # ``file_name`` and always opened the hard-coded "word.list".
    word_list = WordList(file_name)
    print("[DONE]")
    return word_list
def preprocess(data_path,
               is_testing,
               min_occurrences=5,
               cache_bow_output=None,
               cache_word2vec_output=None,
               duration=None):
    """Run the full text pre-processing pipeline and return both data models.

    The local name ``data`` is rebound at every stage, each stage wrapping
    the previous one: DataInitializer -> DataCleaning -> Sentiments ->
    DataTokenize -> WordList -> BagOfWords (-> Plotting when not testing).
    A second model is built from the WordList stage through RedditData and a
    Word2VecProvider. The "summary" column is also turned into padded
    integer sequences with Tokenizer / pad_sequences.

    Files written: the cleaned data with sentiments (CSV under ``data/``)
    and, when ``cache_bow_output`` is not None, both data models as CSV plus
    the pickles ``sequences`` and ``sentiments`` in the working directory.

    Args:
        data_path: forwarded to ``DataInitializer.initialize``.
        is_testing: forwarded to several stages; also selects the output
            CSV name and disables plotting when truthy.
        min_occurrences: forwarded to ``build_wordlist``.
        cache_bow_output: destination for the bag-of-words model CSV; when
            None nothing is cached.
        cache_word2vec_output: destination for the word2vec model CSV.
        duration: forwarded to ``initialize`` only when truthy.

    Returns:
        tuple: ``(data.data_model, word2vec_data_model)``.
    """
    if duration:
        data = DataInitializer()
        data.initialize(data_path, is_testing, duration=duration)
    else:
        data = DataInitializer()
        data.initialize(data_path, is_testing)

    # NOTE(review): prices are fetched only when data/BTC.csv already
    # exists - confirm this condition is not inverted.
    if os.path.isfile("data/BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()

    data = DataCleaning(data, is_testing)
    data.cleanup(DataCleaner(is_testing))

    if is_testing:
        print("Testing data shape:", data.processed_data.shape)
    else:
        print("Training data shape:", data.processed_data.shape)

    data = Sentiments(data)
    data.sentiment_analysis_by_text()
    print("First five rows with sentiment: ", data.processed_data.head())
    # Persist the cleaned, sentiment-annotated rows; the file name depends
    # on whether this is the test or the training run.
    if is_testing:
        data.processed_data.to_csv("data/clean_test_with_sentiments.csv",
                                   sep=',',
                                   encoding='utf-8',
                                   index=False)
        # os.remove(data_path)
    else:
        data.processed_data.to_csv("data/clean_train_with_sentiments.csv",
                                   sep=',',
                                   encoding='utf-8',
                                   index=False)
        # os.remove(data_path)

    data = DataTokenize(data)
    data.tokenize()
    data.stem()

    data = WordList(data)
    data.build_wordlist(min_occurrences=min_occurrences)

    # Keep a handle on the WordList stage: the word2vec model below is
    # built from it, while ``data`` moves on to the bag-of-words stage.
    word2vec_data = data
    data = BagOfWords(data.processed_data, data.wordlist, is_testing)
    data.build_data_model()
    print("data model head: ", data.data_model.head(5))
    """
    Word 2 vec
    """

    word2vec = Word2VecProvider()

    # REPLACE PATH TO THE FILE
    word2vec.load("../twitter/data/glove.twitter.27B.200d.txt")

    word2vec_data = RedditData(word2vec_data)
    word2vec_data.build_final_model(word2vec)

    word2vec_data_model = word2vec_data.data_model
    # Drop a stale "index" column if present, drop rows containing NaN,
    # then index the frame by its 'timestamp_ms' column.
    if "index" in word2vec_data_model.columns:
        word2vec_data_model.drop("index", axis=1, inplace=True)
    word2vec_data_model.dropna(axis=0, inplace=True)
    word2vec_data_model.reset_index(inplace=True)
    word2vec_data_model.index = word2vec_data_model['timestamp_ms']
    print("final word2vec data model: \n", word2vec_data_model.head(), "\n")
    """
    Tokenizing the data
    """
    texts = []
    sentiments = []
    # NOTE(review): tokenized_data is never read afterwards.
    tokenized_data = pd.DataFrame()
    for text in data.processed_data["summary"]:
        texts.append(text)
    for sentiment in data.processed_data["sentiment"]:
        sentiments.append(sentiment)
    print("texts: ", texts[0:5])
    # Vocabulary capped at 20000 words; every sequence is padded or
    # truncated to 200 entries.
    tokenizer = Tokenizer(num_words=20000)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=200)

    print(
        "\n\n##################################################\npadded sequence head: \n",
        padded_sequences[0:5])
    print(
        "\n####################################################\n padded sequence length \n",
        len(padded_sequences))

    # NOTE(review): after this rebinding ``data`` is a Plotting instance, so
    # the ``data.data_model`` accesses below rely on Plotting exposing
    # ``data_model`` - confirm.
    if not is_testing:
        data = Plotting(data)
        data.plot()

    # NOTE(review): only cache_bow_output is checked for None;
    # cache_word2vec_output is passed to to_csv as-is - confirm callers
    # always supply both.
    if cache_bow_output is not None:
        data.data_model.to_csv(cache_bow_output,
                               index=False,
                               float_format="%.6f")
        word2vec_data_model.to_csv(cache_word2vec_output,
                                   index=False,
                                   float_format="%.6f")
        # The pickles are written relative to the current working directory.
        with open('sequences', 'wb') as fp:
            pickle.dump(padded_sequences, fp)
        with open('sentiments', 'wb') as fp:
            pickle.dump(sentiments, fp)

    return data.data_model, word2vec_data_model
Exemple #6
0
    ap.add_argument('-depth',
                    required=False,
                    type=int,
                    help="How deep should we descend into the structure.")
    ap.add_argument('-bruteforce',
                    dest='bruteforce',
                    action='store_true',
                    help="Do you want to use bruteforce?")
    ap.set_defaults(threads=20, depth=2, bruteforce=False)

    args = vars(ap.parse_args())
    url = args['url']
    if args['bruteforce']:
        word_generator = BruteForceGenerator().generator()
    elif args['words']:
        word_generator = WordList(args['words']).word_list
    else:
        print(
            "Please set the bruteforce flag or specify a path to a text file containing words."
        )
        exit(0)
    dircrawl_manager = DircrawlManager(word_generator, url, args['threads'],
                                       args['depth'])
    dircrawl_manager.run()
    list_for_tabulate = [[request.url, request.http_code]
                         for request in dircrawl_manager.results]
    print(
        tabulate(list_for_tabulate,
                 headers=['URL', 'HTTP CODE'],
                 tablefmt='grid'))
Exemple #7
0
#!/usr/bin/env python
from word_list import WordList

__author__ = 'kreitzem'

# rack = sys.argv[1].upper()
# Dictionary used to validate the candidate two-letter words.
wl = WordList()

word = "WASTE"
rack = "THAT".upper()

# Current positions in ``word`` and ``rack`` respectively.
word_num = 0
rack_num = 0

# Pair the current letter of ``word`` with the current rack letter and check
# whether the pair is a word. The word position advances on every step; the
# rack position advances only when its letter was used in a valid word.
for _ in range(len(word) - 1):
    if rack_num >= len(rack):
        # Every rack letter has been used; nothing is left to pair.
        break
    temp_word = word[word_num] + rack[rack_num]
    if wl.is_a_word(temp_word):
        # Parenthesised form works as a print statement (Python 2) and as a
        # function call (Python 3).
        print("we got %s" % temp_word)
        # The original ``rack_num = + 1`` assigned the constant 1.
        rack_num += 1
    else:
        print("not a word %s" % temp_word)
    # The original ``word_num = + 1`` pinned the index at 1 forever.
    word_num += 1


Exemple #8
0
 def __init__(self, anagram_dictionary=WordList().all):
     """Store the anagram dictionary on the instance.

     Args:
         anagram_dictionary: kept unchanged as ``self.anagram_dictionary``.
             The default expression ``WordList().all`` is evaluated once,
             when this ``def`` statement executes, so every instance created
             without an argument receives that same object.
     """
     # NOTE(review): if ``WordList().all`` is a mutable container, changes
     # made through one instance are visible to all others using the
     # default - confirm this sharing is intended.
     self.anagram_dictionary = anagram_dictionary
Exemple #9
0
 def setUpClass(cls):
     """Build one WordList from the English word file for the whole class."""
     english_words_path = 'boggle_app/word_lists/en.txt'
     cls.word_list = WordList(english_words_path)
Exemple #10
0
def _load_glove_embeddings(path):
    """Read a GloVe text file into a ``{word: float32 vector}`` dict."""
    embeddings_index = dict()
    with open(path, "r", encoding="utf-8") as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                # Tolerate blank lines instead of failing on values[0].
                continue
            embeddings_index[values[0]] = numpy.asarray(values[1:],
                                                        dtype='float32')
    return embeddings_index


def _tokenize_and_dump(processed_data, split):
    """Tokenize the "text" column and pickle the merged features and targets.

    ``split`` is "train" or "test"; it selects the names of the pickles
    written under ``data/``. Returns ``(padded_sequences, tokenizer)``.
    """
    print("\n##########################\n"
          "Tokenizing the tweets\n"
          "############################\n")
    texts = list(processed_data["text"])
    print("texts: ", texts[0:5])

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    # Every sequence is padded/truncated to 20 entries, padding at the end.
    padded_sequences = pd.DataFrame(
        data=pad_sequences(sequences, maxlen=20, padding='post'))

    # Model input: token ids side by side with the price features.
    merged_data = pd.concat([
        padded_sequences, processed_data[[
            "high", "low", "open", "quoteVolume", "volume", "weightedAverage"
        ]]
    ],
                            axis=1)
    targets = processed_data[["close"]]
    print("shape of merged %s data: " % split, merged_data.shape)

    with open('data/%s_sequences' % split, 'wb') as fp:
        pickle.dump(merged_data, fp)
    with open('data/%s_prices' % split, 'wb') as fp:
        pickle.dump(targets, fp)

    return padded_sequences, tokenizer


def preprocess(data_path,
               is_testing,
               min_occurrences=5,
               cache_bow_output=None,
               cache_word2vec_output=None,
               duration=None,
               sentiment_method=None):
    """Run the tweet pre-processing pipeline and return both data models.

    Stages: DataInitializer -> (GetPricesData when a price CSV is missing)
    -> DataCleaning -> Sentiments -> cleaned CSV under ``data/``. After
    that, either the cached models are read back from ``cache_bow_output``
    and ``cache_word2vec_output``, or they are rebuilt through DataTokenize
    -> WordList -> BagOfWords and Word2VecProvider -> TwitterData. Finally
    the "text" column is tokenized and the resulting sequences, targets and
    word indexes (plus the GloVe embeddings for the training run) are
    pickled under ``data/``.

    Args:
        data_path: forwarded to ``DataInitializer.initialize``.
        is_testing: selects the test or train file names and is forwarded
            to several stages.
        min_occurrences: forwarded to ``build_wordlist``.
        cache_bow_output: path of the bag-of-words model CSV, or None.
        cache_word2vec_output: path of the word2vec model CSV, or None.
        duration: forwarded to ``initialize`` only when both cache paths
            are also given (unchanged from the previous behaviour).
        sentiment_method: forwarded to ``Sentiments``.

    Returns:
        tuple: ``(data.data_model, word2vec_data_model)``.
    """
    data = DataInitializer()
    if duration and cache_bow_output and cache_word2vec_output:
        data.initialize(data_path, is_testing, duration=duration)
    elif cache_bow_output and cache_word2vec_output:
        data.initialize(data_path,
                        is_testing,
                        cache_bow_output=cache_bow_output,
                        cache_word2vec_output=cache_word2vec_output)
    else:
        data.initialize(data_path, is_testing)

    # Each missing price file triggers its own fetch, checked in this order.
    if not os.path.isfile("data/Train_BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()

    if not os.path.isfile("data/Test_BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()

    data = DataCleaning(data, is_testing)
    data.cleanup(DataCleaner(is_testing))

    if is_testing:
        print("Testing data shape:", data.processed_data.shape)
    else:
        print("Training data shape:", data.processed_data.shape)

    data = Sentiments(data, sentiment_method=sentiment_method)
    data.sentiment_analysis_by_text()

    print("First five rows with sentiment: ", data.processed_data.head())
    if is_testing:
        clean_csv_path = "data/one_month_clean_test_data_with_prices.csv"
    else:
        clean_csv_path = "data/one_month_clean_data_with_prices.csv"
    data.processed_data.to_csv(clean_csv_path,
                               sep=',',
                               encoding='utf-8',
                               index=False)

    # os.path.isfile(None) raises TypeError, so the cache is only consulted
    # when both cache paths were actually supplied.
    caches_given = (cache_bow_output is not None
                    and cache_word2vec_output is not None)
    if caches_given and os.path.isfile(cache_word2vec_output):
        print("cache_word2vec_output file name: ", cache_word2vec_output)
        word2vec_data_model = pd.read_csv(cache_word2vec_output)
        data.data_model = pd.read_csv(cache_bow_output)
        print("data model head: ", data.data_model.head(5))
    else:
        data = DataTokenize(data)
        data.tokenize()
        data.stem()

        data = WordList(data)
        data.build_wordlist(min_occurrences=min_occurrences)

        # Keep the WordList stage for the word2vec model; ``data`` itself
        # moves on to the bag-of-words stage.
        word2vec_data = data
        data = BagOfWords(data.processed_data, data.wordlist, is_testing)
        data.build_data_model()
        print("data model head: ", data.data_model.head(5))

        word2vec = Word2VecProvider()

        # REPLACE PATH TO THE FILE
        word2vec.load("data/glove.twitter.27B.200d-with2num.txt")
        word2vec_data = TwitterData(word2vec_data)
        word2vec_data.build_final_model(word2vec)
        word2vec_data_model = word2vec_data.data_model

        if "original_id" in word2vec_data_model.columns:
            word2vec_data_model.drop("original_id", axis=1, inplace=True)
        word2vec_data_model.dropna(axis=0, inplace=True)
        word2vec_data_model.reset_index(inplace=True, drop=True)
        word2vec_data_model.index = word2vec_data_model['timestamp']

    print("final word2vec data model: \n", word2vec_data_model.head(), "\n")

    split = "test" if is_testing else "train"
    # Stays None when tokenisation is skipped, so the summary prints below
    # can no longer hit an unbound local.
    padded_sequences = None

    # NOTE(review): this looks for "<split>_sequences" in the working
    # directory while the pickle is written to "data/<split>_sequences" -
    # confirm which location is meant. The path is kept as it was.
    if not os.path.isfile("%s_sequences" % split):
        padded_sequences, tokenizer = _tokenize_and_dump(
            data.processed_data, split)

        if not is_testing:
            # Load the whole embedding into memory (training run only).
            embeddings_index = _load_glove_embeddings(
                "data/glove.twitter.27B.200d-with2num.txt")
            print("*" * 80, "\n" * 10)
            print('Loaded %s train word vectors.' % len(embeddings_index))
            print('Total %s of word indexes.' % len(tokenizer.word_index))

            with open('data/embeddings_index', 'wb') as fp:
                pickle.dump(embeddings_index, fp)

        with open('data/%s_word_indexes' % split, 'wb') as fp:
            pickle.dump(tokenizer.word_index, fp)

    if padded_sequences is not None:
        print(
            "\n\n##################################################\npadded sequence head: \n",
            padded_sequences[0:5])
        print(
            "\n####################################################\n padded sequence length \n",
            len(padded_sequences))

    # train_data_word2vec_file_name / test_data_word2vec_file_name are not
    # defined in this function; presumably module-level names - verify.
    if not os.path.isfile(train_data_word2vec_file_name) or not os.path.isfile(
            test_data_word2vec_file_name):
        if cache_bow_output is not None:
            data.data_model.to_csv(cache_bow_output,
                                   index=False,
                                   float_format="%.6f")
            # Without a path to_csv would only build a string and discard it.
            if cache_word2vec_output is not None:
                word2vec_data_model.to_csv(cache_word2vec_output,
                                           index=False,
                                           float_format="%.6f")
    return data.data_model, word2vec_data_model
 def test_check_word():
     """Assemble a board holding the word "hello" and a GameStatus on top of it."""
     # Build the objects a game needs: the board, the dictionary and the
     # status object tying them together.
     board = CharMat()
     board.set_word("hello", (4, 4), (0, 0))
     dictionary = WordList()
     status = GameStatus(board, dictionary)
def stat_maker():
    """Return a StatMaker built from the words of a default WordList."""
    return StatMaker(WordList().words)