	def getWords(self, tree):
		wl = WordList()
		for data in tree.findall('words/word'):
			word = data.text
			occ = int(data.get('occured'))
			usf = int(data.get('useful'))
			usl = int(data.get('useless'))
			wl.set(word, occ, usf, usl)
		return wl
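A small, hypothetical illustration of the markup getWords expects, inferred from the findall path and attribute names above; the XMLParser class name is assumed from the later call xParser.getWords(tree):

import xml.etree.ElementTree as ET

# Minimal sample tree; 'occured', 'useful' and 'useless' follow the
# attribute names read in getWords above.
sample_tree = ET.fromstring(
    "<data><words>"
    "<word occured='3' useful='2' useless='1'>python</word>"
    "</words></data>"
)
# wl = XMLParser().getWords(sample_tree)  # hypothetical call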
def correct_words(classified_words, actual_words, dictionary_word_list, names_word_list, mode='single'):
    corrected_words = classified_words[:]
    for word_index in range(len(corrected_words)):
        current_word = corrected_words[word_index]
        lowercase_word = corrected_words[word_index].lower()

        # Don't correct valid single-letter words ('a' and 'I')
        valid_single_letters = ('a', 'i')

        if lowercase_word not in valid_single_letters:
            # Preserve capitalisation of first letter of each word
            capitalised = current_word[0].isupper()

            if not dictionary_word_list.word_match(lowercase_word) and not names_word_list.word_match(lowercase_word):
                name_word = names_word_list.correct_word(lowercase_word, isCapitalised=capitalised, mode=mode)
                dictionary_word = dictionary_word_list.correct_word(lowercase_word, isCapitalised=capitalised, mode=mode)

                # Skip to the next word if no valid correction was found
                if name_word is None and dictionary_word is None:
                    continue
                # If no name words were found, use the dictionary word
                elif name_word is None and dictionary_word is not None:
                    current_word = dictionary_word
                # If no dictionary words were found, use the name word
                elif name_word is not None and dictionary_word is None:
                    current_word = name_word
                # Otherwise correct to whichever candidate is closer to the original word
                elif names_word_list.word_difference(current_word, name_word) < dictionary_word_list.word_difference(current_word, dictionary_word):
                    current_word = name_word
                else:
                    current_word = dictionary_word

        corrected_words[word_index] = current_word

    return corrected_words
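A minimal usage sketch for correct_words, assuming dictionary_word_list and names_word_list are WordList-style objects exposing word_match, correct_word and word_difference as used above; the file-based constructor and paths are placeholders:

dictionary_words = WordList('wordlists/dictionary.txt')  # placeholder path
name_words = WordList('wordlists/names.txt')             # placeholder path
ocr_words = ['Tbe', 'cat', 'sat', 'on', 'Jmaes']
# actual_words is accepted but not used by the excerpt above
fixed = correct_words(ocr_words, ocr_words, dictionary_words, name_words, mode='single')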
Example #4
def test_solve():
    solver = Solver()
    wordlist = WordList().words
    dealt = 'JREZQXW'
    expected = [
        'ER', 'EX', 'JEW', 'RE', 'REW', 'REX', 'REZ', 'WE', 'WEX', 'ZEX'
    ]
    assert solver.solve(dealt) == expected
    def test_init(self):
        char_mat = CharMat()
        word_list = WordList()
        game_status = GameStatus(char_mat, word_list)
        self.assertEqual(game_status.word_list, word_list)
        self.assertEqual(game_status.char_mat, char_mat)
        self.assertEqual(game_status.score, 0)
        self.assertEqual(game_status.time, 60)
        self.assertEqual(game_status.times_up, False)
        self.assertEqual(game_status.success, False)
    def __init__(self, grid, word_list_filename, print_func=print):
        """Inits WordSearch from an grid of letters and from
        input word-list text file.

        Args:
            grid (list) - letters grid as list of n m-letters str
            word_list_filename (str) - full path input word-list text file.
            print_func (func(x)) - [optional] print word function
        """
        self.set_grid(grid)
        self._word_list = WordList(word_list_filename)
        self._print_func = print_func
        self._no_found_words = 0
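A brief construction sketch, following the docstring above; the grid is a list of equal-length strings and the word-list path is a placeholder:

grid = [
    "CATS",
    "ODOG",
    "WREN",
]
ws = WordSearch(grid, "word_lists/en.txt")  # placeholder path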
Example #7
def create_word_list(file_name):
    """Creates WordList object from  input word-list text file.

    Args:
        file_name (str) - full path input word-list UTF-8 text file
            each word in single line.

    Returns:
         The created WordList object
    """
    print("creating word-list from " + str(file_name) + " file... ")
    word_list = WordList("word.list")
    print("[DONE]")
    return word_list
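A short usage sketch; the path is a placeholder for a UTF-8 file with one word per line, as described in the docstring:

word_list = create_word_list("data/word.list")  # placeholder path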
Example #9
#!/usr/bin/env python
from word_list import WordList

__author__ = 'kreitzem'

# rack = sys.argv[1].upper()
wl = WordList()

word = "WASTE"
rack = "THAT".upper()

word_num = 0
rack_num = 0

for num in range(0, len(word) - 1):
    temp_word = word[word_num] + rack[rack_num]
    if wl.is_a_word(temp_word):
        print("we got %s" % temp_word)
        word_num += 1
        rack_num += 1
    else:
        print("not a word %s" % temp_word)
        word_num += 1


Example #10
    def __init__(self, anagram_dictionary=WordList().all):
        # Note: the default dictionary is built once, when this def is evaluated
        self.anagram_dictionary = anagram_dictionary
Example #11
    @classmethod
    def setUpClass(cls):
        cls.word_list = WordList('boggle_app/word_lists/en.txt')
def stat_maker():
    word_list = WordList().words
    return StatMaker(word_list)
Example #13
    def toUpper(self, name, prefix='', suffix=''):
        wl = WordList()
        wl.fromSnakeCase(prefix + self.data[name] + suffix)
        return wl.toUCase()
def test_check_word():
    # create all game-like variables
    char_mat = CharMat()
    char_mat.set_word("hello", (4, 4), (0, 0))
    word_list = WordList()
    game_status = GameStatus(char_mat, word_list)
class MainUpdater():
	"""docstring for MainUpdater"""
	vote = None
	voteId = None
	miPath = None
	XMLInspections = []
	wl = None
	wordAvg = None
	avgRatio = None
	ratioDiff = 2
	keywords = None
	avoids = None

	def __init__(self, vote, voteId, miPath):
		self.vote = vote
		self.voteId = voteId
		self.miPath = miPath
		self.wordXMLPath = "xml/words.xml"

	def loadMasterInspection(self):
		insp = Inspector()
		self.XMLInspections = insp.getInspectionsStr(self.miPath)
		if len(self.XMLInspections) == 0:
			print('Abort. No data found in', self.miPath)
			exit()
		
	def storeWords(self):
		self.wl = WordList()
		xReader = XMLReader()
		xParser = XMLParser()

		if xReader.checkIfExistsQuiet('xml/words.xml'):
			tree = xReader.getTree('xml/words.xml')
			wordAvg, avgRatio = xParser.getGeneralFromWords(tree)
			self.wl = xParser.getWords(tree)

		usf = 0
		usl = 0
		if self.vote == "up":
			usf = 1
		else:
			usl = 1

		for ind, obj in enumerate(self.XMLInspections):
			if obj.ID != self.voteId:
				continue

			pl = PageLoader(obj.fil)
			if not pl.isReadable():
				print('Abort. File not readable:', obj.fil)
				exit()
			pl.read()

			patt = "^[a-zA-Z0-9]*$"
			pl.linkWords = self.removeListElesNotPatterned(patt, pl.linkWords)
			pl.titleWords = self.removeListElesNotPatterned(patt, pl.titleWords)
			pl.headerWords = self.removeListElesNotPatterned(patt, pl.headerWords)
			pl.specialWords = self.removeListElesNotPatterned(patt, pl.specialWords)
			pl.normalWords = self.removeListElesNotPatterned(patt, pl.normalWords)

			for word in pl.linkWords:
				self.wl.append(word, usf, usl)
			for word in pl.titleWords:
				self.wl.append(word, usf, usl)
			for word in pl.headerWords:
				self.wl.append(word, usf, usl)
			for word in pl.specialWords:
				self.wl.append(word, usf, usl)
			for word in pl.normalWords:
				self.wl.append(word, usf, usl)
			return

	def removeListElesNotPatterned(self, patt, li, maxLen=255):
		indList = []
		for i in range(len(li)):
			if re.match(patt, li[i]) is None or len(li[i]) > maxLen:
				indList.append(i)
		for i in reversed(indList):
			li.pop(i)
		return li

	def deleteFile(self):
		for ind, obj in enumerate(self.XMLInspections):
			if obj.ID == self.voteId:
				os = OSTool()
				os.deleteFile(obj.fil)
				return

	def writeWordsXML(self):
		xWriter = XMLWriter()
		xWriter.writeWordXML(self.wl,self.wordAvg,self.avgRatio,self.wordXMLPath)

	def calculateKeywords(self):
		self.keywords = WordList()
		self.avoids = WordList()
		self.wordAvg = 0
		self.avgRatio = 0
		usef = 0
		usel = 0

		# Sum occurrences and useful/useless counts over all stored words
		for data in self.wl.words:
			self.wordAvg += data.occ
			usef += data.usef
			usel += data.usel

		# Average occurrences per word (guard against an empty word list)
		l = len(self.wl.words)
		if l == 0:
			l = 1
		self.wordAvg = self.wordAvg / l
		# Overall useful/useless ratio; fall back to the raw useful count
		# when there are no useless votes yet
		if usel == 0:
			self.avgRatio = usef
		else:
			self.avgRatio = usef / usel

		# A word whose own ratio is more than ratioDiff times the average
		# becomes a keyword; one below average/ratioDiff becomes a word to avoid.
		# E.g. with ratioDiff = 2 and avgRatio = 1.5, ratios above 3.0 mark
		# keywords and ratios below 0.75 mark avoids.
		for data in self.wl.words:
			if data.usel == 0:
				ratio = data.usef
			else:
				ratio = data.usef / data.usel

			if ratio > self.avgRatio * self.ratioDiff:
				self.keywords.set(data.word, data.occ, data.usef, data.usel)
			elif ratio < self.avgRatio / self.ratioDiff:
				self.avoids.set(data.word, data.occ, data.usef, data.usel)

	def updateKeywordsXML(self):
		xWriter = XMLWriter()
		xWriter.writeKeywordsXML(self.keywords, self.avoids, 'xml/keywords.xml')
	
	def updateSitesXMl(self):
		xReader = XMLReader()
		xParser = XMLParser()
		xWriter = XMLWriter()
		tree = xReader.getTree('xml/sites.xml')
		gdSites, bdSites = xParser.getSites(tree)
		data = None
		for obj in self.XMLInspections:
			if obj.ID == self.voteId:
				data = obj
				break
		if data is not None:
			if self.vote == "up":
				gdSites.append(data.url)
			else:
				bdSites.append(data.url)
		xWriter.writeSitesXML(gdSites, bdSites, 'xml/sites.xml')

	def getXMLInspScored(self):
		p = PageToXML(self.XMLInspections, self.keywords, self.avoids)
		self.XMLInspections = p.getScore()
		return self.XMLInspections
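A rough driver sketch for MainUpdater, pieced together from the methods above; the vote, inspection id, file path and the call order are assumptions (only writeWordsXML clearly depends on calculateKeywords having run first):

updater = MainUpdater(vote="up", voteId="42", miPath="xml/masterInspection.xml")  # placeholder values
updater.loadMasterInspection()
updater.storeWords()
updater.calculateKeywords()
updater.writeWordsXML()
updater.updateKeywordsXML()
updater.updateSitesXMl()
updater.deleteFile()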
Example #16
    ap.add_argument('-depth',
                    required=False,
                    type=int,
                    help="How deep should we descend into the structure.")
    ap.add_argument('-bruteforce',
                    dest='bruteforce',
                    action='store_true',
                    help="Do you want to use bruteforce?")
    ap.set_defaults(threads=20, depth=2, bruteforce=False)

    args = vars(ap.parse_args())
    url = args['url']
    if args['bruteforce']:
        word_generator = BruteForceGenerator().generator()
    elif args['words']:
        word_generator = WordList(args['words']).word_list
    else:
        print(
            "Please set the bruteforce flag or specify a path to a text file containing words."
        )
        exit(0)
    dircrawl_manager = DircrawlManager(word_generator, url, args['threads'],
                                       args['depth'])
    dircrawl_manager.run()
    list_for_tabulate = [[request.url, request.http_code]
                         for request in dircrawl_manager.results]
    print(
        tabulate(list_for_tabulate,
                 headers=['URL', 'HTTP CODE'],
                 tablefmt='grid'))
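Judging from the keys read out of args ('url', 'words', 'threads', 'depth', 'bruteforce'), an invocation might look roughly like the lines below; the script name and the exact -url/-words/-threads flag spellings are assumptions, since those add_argument calls are not part of this excerpt:

# python dircrawl.py -url http://example.com -words wordlist.txt -threads 20 -depth 2
# python dircrawl.py -url http://example.com -bruteforce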
def preprocess(data_path,
               is_testing,
               min_occurrences=5,
               cache_bow_output=None,
               cache_word2vec_output=None,
               duration=None):
    if duration:
        data = DataInitializer()
        data.initialize(data_path, is_testing, duration=duration)
    else:
        data = DataInitializer()
        data.initialize(data_path, is_testing)

    if os.path.isfile("data/BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()

    data = DataCleaning(data, is_testing)
    data.cleanup(DataCleaner(is_testing))

    if is_testing:
        print("Testing data shape:", data.processed_data.shape)
    else:
        print("Training data shape:", data.processed_data.shape)

    data = Sentiments(data)
    data.sentiment_analysis_by_text()
    print("First five rows with sentiment: ", data.processed_data.head())
    if is_testing:
        data.processed_data.to_csv("data/clean_test_with_sentiments.csv",
                                   sep=',',
                                   encoding='utf-8',
                                   index=False)
        # os.remove(data_path)
    else:
        data.processed_data.to_csv("data/clean_train_with_sentiments.csv",
                                   sep=',',
                                   encoding='utf-8',
                                   index=False)
        # os.remove(data_path)

    data = DataTokenize(data)
    data.tokenize()
    data.stem()

    data = WordList(data)
    data.build_wordlist(min_occurrences=min_occurrences)

    word2vec_data = data
    data = BagOfWords(data.processed_data, data.wordlist, is_testing)
    data.build_data_model()
    print("data model head: ", data.data_model.head(5))
    """
    Word 2 vec
    """

    word2vec = Word2VecProvider()

    # REPLACE PATH TO THE FILE
    word2vec.load("../twitter/data/glove.twitter.27B.200d.txt")

    word2vec_data = RedditData(word2vec_data)
    word2vec_data.build_final_model(word2vec)

    word2vec_data_model = word2vec_data.data_model
    if "index" in word2vec_data_model.columns:
        word2vec_data_model.drop("index", axis=1, inplace=True)
    word2vec_data_model.dropna(axis=0, inplace=True)
    word2vec_data_model.reset_index(inplace=True)
    word2vec_data_model.index = word2vec_data_model['timestamp_ms']
    print("final word2vec data model: \n", word2vec_data_model.head(), "\n")
    """
    Tokenizing the data
    """
    texts = []
    sentiments = []
    tokenized_data = pd.DataFrame()
    for text in data.processed_data["summary"]:
        texts.append(text)
    for sentiment in data.processed_data["sentiment"]:
        sentiments.append(sentiment)
    print("texts: ", texts[0:5])
    tokenizer = Tokenizer(num_words=20000)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=200)

    print(
        "\n\n##################################################\npadded sequence head: \n",
        padded_sequences[0:5])
    print(
        "\n####################################################\n padded sequence length \n",
        len(padded_sequences))

    if not is_testing:
        data = Plotting(data)
        data.plot()

    if cache_bow_output is not None and cache_word2vec_output is not None:
        data.data_model.to_csv(cache_bow_output,
                               index=False,
                               float_format="%.6f")
        word2vec_data_model.to_csv(cache_word2vec_output,
                                   index=False,
                                   float_format="%.6f")
        with open('sequences', 'wb') as fp:
            pickle.dump(padded_sequences, fp)
        with open('sentiments', 'wb') as fp:
            pickle.dump(sentiments, fp)

    return data.data_model, word2vec_data_model
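A hedged call sketch based only on the signature above; the CSV paths are placeholders:

bow_model, word2vec_model = preprocess(
    "data/tweets.csv",                                # placeholder input path
    is_testing=False,
    min_occurrences=5,
    cache_bow_output="data/bow_cache.csv",            # placeholder cache paths
    cache_word2vec_output="data/word2vec_cache.csv",
)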
Example #18
def preprocess(data_path,
               is_testing,
               min_occurrences=5,
               cache_bow_output=None,
               cache_word2vec_output=None,
               duration=None,
               sentiment_method=None):
    if duration and cache_bow_output and cache_word2vec_output:
        data = DataInitializer()
        data.initialize(data_path, is_testing, duration=duration)
    elif cache_bow_output and cache_word2vec_output:
        data = DataInitializer()
        data.initialize(data_path,
                        is_testing,
                        cache_bow_output=cache_bow_output,
                        cache_word2vec_output=cache_word2vec_output)
    else:
        data = DataInitializer()
        data.initialize(data_path, is_testing)

    if not os.path.isfile("data/Train_BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()

    if not os.path.isfile("data/Test_BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()

    data = DataCleaning(data, is_testing)
    data.cleanup(DataCleaner(is_testing))

    if is_testing:
        print("Testing data shape:", data.processed_data.shape)
    else:
        print("Training data shape:", data.processed_data.shape)

    data = Sentiments(data, sentiment_method=sentiment_method)
    data.sentiment_analysis_by_text()

    print("First five rows with sentiment: ", data.processed_data.head())
    if is_testing:
        data.processed_data.to_csv(
            "data/one_month_clean_test_data_with_prices.csv",
            sep=',',
            encoding='utf-8',
            index=False)
        # os.remove(data_path)
    else:
        data.processed_data.to_csv("data/one_month_clean_data_with_prices.csv",
                                   sep=',',
                                   encoding='utf-8',
                                   index=False)
        # os.remove(data_path)

    if cache_word2vec_output and os.path.isfile(cache_word2vec_output):
        print("cache_word2vec_output file name: ", cache_word2vec_output)
        word2vec_data_model = pd.read_csv(cache_word2vec_output)
        data.data_model = pd.read_csv(cache_bow_output)
        print("data model head: ", data.data_model.head(5))
    else:
        data = DataTokenize(data)
        data.tokenize()
        data.stem()

        data = WordList(data)
        data.build_wordlist(min_occurrences=min_occurrences)

        word2vec_data = data
        data = BagOfWords(data.processed_data, data.wordlist, is_testing)
        data.build_data_model()
        print("data model head: ", data.data_model.head(5))
        """
        Word 2 vec
        """

        word2vec = Word2VecProvider()

        # REPLACE PATH TO THE FILE
        word2vec.load("data/glove.twitter.27B.200d-with2num.txt")
        word2vec_data = TwitterData(word2vec_data)
        word2vec_data.build_final_model(word2vec)
        word2vec_data_model = word2vec_data.data_model

        if "original_id" in word2vec_data_model.columns:
            word2vec_data_model.drop("original_id", axis=1, inplace=True)
        word2vec_data_model.dropna(axis=0, inplace=True)
        word2vec_data_model.reset_index(inplace=True, drop=True)
        word2vec_data_model.index = word2vec_data_model['timestamp']

    print("final word2vec data model: \n", word2vec_data_model.head(), "\n")

    # if not is_testing:
    #     data = Plotting(data)
    #     data.plot()

    if not is_testing:
        if not os.path.isfile("train_sequences"):
            print("\n##########################\n"
                  "Tokenizing the tweets\n"
                  "############################\n")
            texts = []
            sentiments = []
            tokenized_data = pd.DataFrame()

            for text in data.processed_data["text"]:
                texts.append(text)

            for sentiment in data.processed_data['sentiment']:
                sentiments.append(sentiment)

            print("texts: ", texts[0:5])
            tokenizer = Tokenizer()
            tokenizer.fit_on_texts(texts)
            sequences = tokenizer.texts_to_sequences(texts)
            padded_sequences = pad_sequences(sequences,
                                             maxlen=20,
                                             padding='post')
            padded_sequences = pd.DataFrame(data=padded_sequences)

            merged_train_data = pd.concat([
                padded_sequences, data.processed_data[[
                    "high", "low", "open", "quoteVolume", "volume",
                    "weightedAverage"
                ]]
            ],
                                          axis=1)
            train_targets = data.processed_data[["close"]]
            print("shape of merged train data: ", merged_train_data.shape)

            with open('data/train_sequences', 'wb') as fp:
                pickle.dump(merged_train_data, fp)
            with open('data/train_prices', 'wb') as fp:
                pickle.dump(train_targets, fp)

            # load the whole embedding into memory
            embeddings_index = dict()
            with open("data/glove.twitter.27B.200d-with2num.txt",
                      "r",
                      encoding="utf-8") as my_file:
                for line in my_file:
                    values = line.split()
                    word = values[0]
                    coefs = numpy.asarray(values[1:], dtype='float32')
                    embeddings_index[word] = coefs
            # f.close()
            print("*" * 80, "\n" * 10)
            print('Loaded %s train word vectors.' % len(embeddings_index))
            print('Total %s of word indexes.' % len(tokenizer.word_index))

            with open('data/embeddings_index', 'wb') as fp:
                pickle.dump(embeddings_index, fp)
            with open('data/train_word_indexes', 'wb') as fp:
                pickle.dump(tokenizer.word_index, fp)

            # encode class values as integers
            # encoder = LabelEncoder()
            # encoder.fit(sentiments)
            # encoded_sentiments = encoder.transform(sentiments)

            # convert integers to dummy variables (i.e. one hot encoded)
            # dummy_sentiments = np_utils.to_categorical(encoded_sentiments)

            # for text in data.processed_data.loc[data.processed_data['sentiment'] != 0, "text"]:
            #     texts.append(text)
            #
            # for sentiment in data.processed_data.loc[data.processed_data['sentiment'] != 0, "sentiment"]:
            #     sentiments.append(sentiment)

    else:
        if not os.path.isfile("test_sequences"):
            print("\n##########################\n"
                  "Tokenizing the tweets\n"
                  "############################\n")
            texts = []
            sentiments = []
            tokenized_data = pd.DataFrame()

            for text in data.processed_data["text"]:
                texts.append(text)

            for sentiment in data.processed_data['sentiment']:
                sentiments.append(sentiment)

            print("texts: ", texts[0:5])
            tokenizer = Tokenizer()
            tokenizer.fit_on_texts(texts)
            sequences = tokenizer.texts_to_sequences(texts)
            padded_sequences = pad_sequences(sequences,
                                             maxlen=20,
                                             padding='post')
            padded_sequences = pd.DataFrame(data=padded_sequences)

            merged_test_data = pd.concat([
                padded_sequences, data.processed_data[[
                    "high", "low", "open", "quoteVolume", "volume",
                    "weightedAverage"
                ]]
            ],
                                         axis=1)
            test_targets = data.processed_data[["close"]]
            print("shape of merged test data: ", merged_test_data.shape)

            with open('data/test_sequences', 'wb') as fp:
                pickle.dump(merged_test_data, fp)
            with open('data/test_prices', 'wb') as fp:
                pickle.dump(test_targets, fp)
            with open('data/test_word_indexes', 'wb') as fp:
                pickle.dump(tokenizer.word_index, fp)

            # padded_sequences = pd.DataFrame(data=padded_sequences)

    print(
        "\n\n##################################################\npadded sequence head: \n",
        padded_sequences[0:5])
    print(
        "\n####################################################\n padded sequence length \n",
        len(padded_sequences))

    if not os.path.isfile(train_data_word2vec_file_name) or not os.path.isfile(
            test_data_word2vec_file_name):
        if cache_bow_output is not None:
            data.data_model.to_csv(cache_bow_output,
                                   index=False,
                                   float_format="%.6f")
            word2vec_data_model.to_csv(cache_word2vec_output,
                                       index=False,
                                       float_format="%.6f")
    return data.data_model, word2vec_data_model