def __init__(self, vocabulary, fixed_length, custom_wordgen=None,
             ignore_sentences_with_only_custom=False, masking_value=0,
             unknown_value=1):
    """ Set up the tokenizer. Needs a dictionary as input for the vocabulary.

    # Arguments:
        vocabulary: Dictionary used as the vocabulary. A deep copy is
            stored, so the caller's object is never modified.
        fixed_length: Stored as `self.fixed_length`.
        custom_wordgen: Optional word generator to use instead of the
            default one. Its `stream` attribute must be None.
        ignore_sentences_with_only_custom: Stored on the instance.
        masking_value: Stored on the instance.
        unknown_value: Stored on the instance.

    # Raises:
        ValueError: When the vocabulary has more entries than a uint16
            can represent.
    """
    uint16_limit = np.iinfo('uint16').max
    vocab_size = len(vocabulary)
    if vocab_size > uint16_limit:
        raise ValueError('Dictionary is too big ({} tokens) for the numpy '
                         'datatypes used (max limit={}). Reduce vocabulary'
                         ' or adjust code accordingly!'
                         .format(vocab_size, uint16_limit))

    # Shouldn't be able to modify the given vocabulary
    self.vocabulary = deepcopy(vocabulary)
    self.fixed_length = fixed_length
    self.ignore_sentences_with_only_custom = ignore_sentences_with_only_custom
    self.masking_value = masking_value
    self.unknown_value = unknown_value

    # The generator starts with an empty stream of sentences; the stream is
    # fed to it at a later point so the generator can be reused.
    if custom_wordgen is None:
        self.wordgen = WordGenerator(None, allow_unicode_text=True,
                                     ignore_emojis=False,
                                     remove_variation_selectors=True,
                                     break_replacement=True)
    else:
        # A custom word generator can be used for domain-specific
        # filtering etc, but it must not already carry a stream.
        assert custom_wordgen.stream is None
        self.wordgen = custom_wordgen
    self.uses_custom_wordgen = custom_wordgen is not None
def get_word():
    """Generate the output for a submitted task and return its file name.

    Reads `task_id` and `show_details` from the posted form, loads the
    first matching `SubmitRecord`, decodes its JSON `result` and hands it
    to a `WordGenerator`.

    Returns:
        The `filename` attribute of the generator after `generate()` ran.
    """
    form = request.form
    task_id = form['task_id']
    # The flag arrives as text; only the literal 'true' enables details.
    show_details = form['show_details'] == 'true'

    # NOTE(review): `first()` presumably yields None when no record
    # matches, which would fail on the attribute access below - confirm.
    record = SubmitRecord.query.filter_by(task_id=task_id).first()
    payload = json.loads(record.result)

    generator = WordGenerator(record.file_name, payload, show_details)
    generator.generate()
    return generator.filename
def __init__(self):
    """Create the word source, the widgets and the timer bookkeeping.

    The parent initializer is invoked last, after every attribute has
    been set.
    """
    # Collaborators, constructed in the same order as before.
    self.words_generator = WordGenerator('word.yml')
    self.layout = BoxLayout(orientation="vertical")
    self.keyboard = None  # assigned outside this method
    self.control_panel = ControlPanel(self.start_timer,
                                      self.stop_ev,
                                      self.change_difficulty)

    # Scheduling handles; nothing is scheduled yet.
    self.timer = self.stop_event = self.tick_event = None
    self.ticks = MAX_TIME

    self.screen = Screen()
    self.difficulty = "Easy"

    super(TouchKeyboard, self).__init__()
def main(args):
    """Build a WordGenerator and upload the WOTD to a cloud storage blob.

    Reads from `args`: `verbose`, `log_file`, `device`,
    `forward_model_path`, `inverse_model_path`, `blacklist_path`,
    `quantize`, `gcloud_project`, `gcloud_bucket` and `gcloud_path`.

    Raises:
        RuntimeError: When GOOGLE_APPLICATION_CREDENTIALS is unset or empty.
    """
    if not os.environ.get("GOOGLE_APPLICATION_CREDENTIALS"):
        raise RuntimeError(
            "Expected to set GOOGLE_APPLICATION_CREDENTIALS env var")

    # Remove all handlers associated with the root logger object so the
    # basicConfig call below takes effect.
    for existing_handler in list(logging.root.handlers):
        logging.root.removeHandler(existing_handler)

    log_options = {"level": logging.DEBUG if args.verbose else logging.INFO}
    if args.log_file:
        print(f"Logging to {args.log_file}")
        log_options.update(
            filename=args.log_file,
            filemode="a",
            format="%(asctime)s - %(levelname)s - %(message)s",
        )
    logging.basicConfig(**log_options)

    word_generator = WordGenerator(
        device=args.device,
        forward_model_path=args.forward_model_path,
        inverse_model_path=args.inverse_model_path,
        blacklist_path=args.blacklist_path,
        quantize=args.quantize,
    )

    logger.info("Uploading WOTD")
    bucket = storage.Client(project=args.gcloud_project).get_bucket(
        args.gcloud_bucket)
    upload_wotd(bucket.blob(args.gcloud_path), word_generator)
def __init__(self, vocabulary, fixed_length, custom_wordgen=None,
             ignore_sentences_with_only_custom=False, masking_value=0,
             unknown_value=1):
    """ Set up a tokenizer with a private vocabulary copy and a default
        word generator.

    Only `vocabulary` is read. `fixed_length`, `custom_wordgen`,
    `ignore_sentences_with_only_custom`, `masking_value` and
    `unknown_value` are accepted but not used: `self.fixed_length` is
    always 30 and a default WordGenerator is always created.
    """
    wordgen_options = dict(allow_unicode_text=True,
                           ignore_emojis=False,
                           remove_variation_selectors=True,
                           break_replacement=True)

    self.vocabulary = deepcopy(vocabulary)
    self.fixed_length = 30  # NOTE(review): the argument is ignored - confirm intent

    # The generator starts with an empty stream of sentences; the stream is
    # fed to it at a later point so the generator can be reused.
    self.wordgen = WordGenerator(None, **wordgen_options)
    print(self.wordgen)
class SentenceTokenizer():
    """ Create numpy array of tokens corresponding to input sentences.
        The vocabulary can include Unicode tokens.
    """
    def __init__(self, vocabulary, fixed_length, custom_wordgen=None,
                 ignore_sentences_with_only_custom=False, masking_value=0,
                 unknown_value=1):
        """ Needs a dictionary as input for the vocabulary.

        # Arguments:
            vocabulary: Dictionary mapping words to token ids. A deep copy
                is stored, so the caller's object is never modified.
            fixed_length: Number of columns of the token arrays; longer
                sentences are truncated to this many tokens.
            custom_wordgen: Optional word generator used instead of the
                default one. Its `stream` attribute must be None.
            ignore_sentences_with_only_custom: Whether sentences whose
                tokens are all below `len(SPECIAL_TOKENS)` are skipped.
            masking_value: Value the token arrays are pre-filled with.
            unknown_value: Token used for words missing from the vocabulary.

        # Raises:
            ValueError: When the vocabulary has more entries than a uint16
                can represent.
        """
        if len(vocabulary) > np.iinfo('uint16').max:
            raise ValueError('Dictionary is too big ({} tokens) for the numpy '
                             'datatypes used (max limit={}). Reduce vocabulary'
                             ' or adjust code accordingly!'
                             .format(len(vocabulary), np.iinfo('uint16').max))

        # Shouldn't be able to modify the given vocabulary
        self.vocabulary = deepcopy(vocabulary)
        self.fixed_length = fixed_length
        self.ignore_sentences_with_only_custom = ignore_sentences_with_only_custom
        self.masking_value = masking_value
        self.unknown_value = unknown_value

        # Initialized with an empty stream of sentences that must then be fed
        # to the generator at a later point for reusability.
        # A custom word generator can be used for domain-specific filtering etc
        if custom_wordgen is not None:
            assert custom_wordgen.stream is None
            self.wordgen = custom_wordgen
            self.uses_custom_wordgen = True
        else:
            self.wordgen = WordGenerator(None, allow_unicode_text=True,
                                         ignore_emojis=False,
                                         remove_variation_selectors=True,
                                         break_replacement=True)
            self.uses_custom_wordgen = False

    def tokenize_sentences(self, sentences, reset_stats=True, max_sentences=None):
        """ Converts a given list of sentences into a numpy array according to
            its vocabulary.

        # Arguments:
            sentences: List of sentences to be tokenized.
            reset_stats: Whether the word generator's stats should be reset.
            max_sentences: Maximum number of sentences (rows of the result).
                Must be set if the length cannot be inferred from the input.

        # Returns:
            Numpy array of the tokenization sentences with masking,
            infos,
            stats

        # Raises:
            ValueError: When maximum length is not set and cannot be inferred.
        """
        if max_sentences is None and not hasattr(sentences, '__len__'):
            raise ValueError('Either you must provide an array with a length '
                             'attribute (e.g. a list) or specify the maximum '
                             'length yourself using `max_sentences`!')
        n_sentences = (max_sentences if max_sentences is not None
                       else len(sentences))

        if self.masking_value == 0:
            tokens = np.zeros((n_sentences, self.fixed_length), dtype='uint16')
        else:
            tokens = (np.ones((n_sentences, self.fixed_length), dtype='uint16')
                      * self.masking_value)

        if reset_stats:
            self.wordgen.reset_stats()

        # With a custom word generator info can be extracted from each
        # sentence (e.g. labels)
        infos = []

        # Returns words as strings and then map them to vocabulary
        self.wordgen.stream = sentences
        next_insert = 0
        n_ignored_unknowns = 0
        for s_words, s_info in self.wordgen:
            s_tokens = self.find_tokens(s_words)

            # Skip sentences made up purely of special tokens when asked to.
            if (self.ignore_sentences_with_only_custom and
                    all(t < len(SPECIAL_TOKENS) for t in s_tokens)):
                n_ignored_unknowns += 1
                continue
            if len(s_tokens) > self.fixed_length:
                s_tokens = s_tokens[:self.fixed_length]
            tokens[next_insert, :len(s_tokens)] = s_tokens
            infos.append(s_info)
            next_insert += 1

        # For standard word generators all sentences should be tokenized
        # this is not necessarily the case for custom wordgenerators as they
        # may filter the sentences etc.
        if not self.uses_custom_wordgen and not self.ignore_sentences_with_only_custom:
            assert len(sentences) == next_insert
        else:
            # adjust based on actual tokens received
            tokens = tokens[:next_insert]
            infos = infos[:next_insert]

        return tokens, infos, self.wordgen.stats

    def find_tokens(self, words):
        """ Maps a non-empty list of words to their vocabulary tokens.

        # Arguments:
            words: Non-empty list of words.

        # Returns:
            List with one token per word; words that are not in the
            vocabulary are mapped to `self.unknown_value`.
        """
        assert len(words) > 0
        tokens = []
        for w in words:
            try:
                tokens.append(self.vocabulary[w])
            except KeyError:
                tokens.append(self.unknown_value)
        return tokens

    def split_train_val_test(self, sentences, info_dicts,
                             split_parameter=[0.7, 0.1, 0.2], extend_with=0):
        """ Splits given sentences into three different datasets: training,
            validation and testing.

        # Arguments:
            sentences: The sentences to be tokenized.
            info_dicts: A list of dicts that contain information about each
                sentence (e.g. a label).
            split_parameter: A parameter for deciding the splits between the
                three different datasets. If instead of being passed three
                values, three lists are passed, then these will be used to
                specify which observation belong to which dataset.
                The default list is only read, never mutated.
            extend_with: An optional parameter. If > 0 then this is the number
                of tokens added to the vocabulary from this dataset. The
                expanded vocab will be generated using only the training set,
                but is applied to all three sets.

        # Returns:
            List of three lists of tokenized sentences,

            List of three corresponding dictionaries with information,

            How many tokens have been added to the vocab. Make sure to extend
            the embedding layer of the model accordingly.
        """

        # If passed three lists, use those directly
        if isinstance(split_parameter, list) and \
                all(isinstance(x, list) for x in split_parameter) and \
                len(split_parameter) == 3:

            # Helper function to verify provided indices are numbers in range
            def verify_indices(inds):
                return list(filter(lambda i: isinstance(i, numbers.Number)
                                   and i < len(sentences), inds))

            ind_train = verify_indices(split_parameter[0])
            ind_val = verify_indices(split_parameter[1])
            ind_test = verify_indices(split_parameter[2])
        else:
            # Split sentences and dicts
            ind = list(range(len(sentences)))
            ind_train, ind_test = train_test_split(ind, test_size=split_parameter[2])
            ind_train, ind_val = train_test_split(ind_train, test_size=split_parameter[1])

        # Map indices to data
        train = np.array([sentences[x] for x in ind_train])
        test = np.array([sentences[x] for x in ind_test])
        val = np.array([sentences[x] for x in ind_val])

        info_train = np.array([info_dicts[x] for x in ind_train])
        info_test = np.array([info_dicts[x] for x in ind_test])
        info_val = np.array([info_dicts[x] for x in ind_val])

        added = 0
        # Extend vocabulary with training set tokens
        if extend_with > 0:
            wg = WordGenerator(train)
            vb = VocabBuilder(wg)
            vb.count_all_words()
            added = extend_vocab(self.vocabulary, vb, max_tokens=extend_with)

        # Wrap results; only the token arrays are needed here.
        result = [self.tokenize_sentences(s)[0] for s in [train, val, test]]
        result_infos = [info_train, info_val, info_test]

        return result, result_infos, added

    def to_sentence(self, sentence_idx):
        """ Converts a tokenized sentence back to a list of words.

        # Arguments:
            sentence_idx: List of numbers, representing a tokenized sentence
                given the current vocabulary.

        # Returns:
            String created by converting all numbers back to words and joined
            together with spaces.
        """
        # Have to recalculate the mappings in case the vocab was extended.
        # `items()` works on both Python 2 and 3, unlike `iteritems()`.
        ind_to_word = {ind: word for word, ind in self.vocabulary.items()}

        sentence_as_list = [ind_to_word[x] for x in sentence_idx]
        cleaned_list = [x for x in sentence_as_list if x != 'CUSTOM_MASK']
        return " ".join(cleaned_list)
def split_train_val_test(self, sentences, info_dicts,
                         split_parameter=[0.7, 0.1, 0.2], extend_with=0):
    """ Splits given sentences into three different datasets: training,
        validation and testing.

    # Arguments:
        sentences: The sentences to be tokenized.
        info_dicts: A list of dicts that contain information about each
            sentence (e.g. a label).
        split_parameter: Either three values, of which the second and third
            are passed as `test_size` when carving out the validation and
            test sets, or three lists of indices that directly name the
            observations of the train, validation and test sets.
        extend_with: An optional parameter. If > 0 then this is the number
            of tokens added to the vocabulary from this dataset. The
            expanded vocab is generated using only the training set, but
            is applied to all three sets.

    # Returns:
        List of three lists of tokenized sentences (train, val, test),

        List of three corresponding arrays of info dicts,

        How many tokens have been added to the vocab. Make sure to extend
        the embedding layer of the model accordingly.
    """
    explicit_indices = (isinstance(split_parameter, list)
                        and all(isinstance(part, list)
                                for part in split_parameter)
                        and len(split_parameter) == 3)

    if explicit_indices:
        # Keep only the provided indices that are numbers below the
        # number of sentences.
        def verify_indices(inds):
            return [i for i in inds
                    if isinstance(i, numbers.Number) and i < len(sentences)]

        ind_train, ind_val, ind_test = (verify_indices(part)
                                        for part in split_parameter)
    else:
        # Carve off the test set first, then the validation set from what
        # is left of the training indices.
        all_indices = list(range(len(sentences)))
        ind_train, ind_test = train_test_split(
            all_indices, test_size=split_parameter[2])
        ind_train, ind_val = train_test_split(
            ind_train, test_size=split_parameter[1])

    def pick(source, indices):
        # Gather the selected observations into a numpy array.
        return np.array([source[i] for i in indices])

    train = pick(sentences, ind_train)
    test = pick(sentences, ind_test)
    val = pick(sentences, ind_val)

    info_train = pick(info_dicts, ind_train)
    info_test = pick(info_dicts, ind_test)
    info_val = pick(info_dicts, ind_val)

    added = 0
    if extend_with > 0:
        # Extend vocabulary with training set tokens
        vocab_builder = VocabBuilder(WordGenerator(train))
        vocab_builder.count_all_words()
        added = extend_vocab(self.vocabulary, vocab_builder,
                             max_tokens=extend_with)

    # Only the token arrays of each tokenization are kept.
    result = []
    for subset in (train, val, test):
        result.append(self.tokenize_sentences(subset)[0])
    result_infos = [info_train, info_val, info_test]

    return result, result_infos, added
from flask import Flask app = Flask(__name__) from flask import request from flask import jsonify from word_generator import WordGenerator word_generator = WordGenerator() @app.route("/") def hello(): return "Hello World!" @app.route("/row_placements/") def row_placements(): letters = request.args.get('letters') if letters == None: letters = "" word_len = request.args.get('word_len') if word_len == None or len(word_len) == 0: word_len = len(letters) word_mask = request.args.get('word_mask') if word_mask == None: word_mask = "" return jsonify( word_generator.getAllPlacements(letters, int(word_len), word_mask)) @app.route("/words_unfiltered/") def words_unfiltered():
class TouchKeyboard(App):
    """Application that stacks a control panel, a screen and a keyboard and
    runs a timed game in which the typed output must match a generated word.
    """

    def __init__(self):
        """Create the word source, the widgets and the timer bookkeeping."""
        self.words_generator = WordGenerator('word.yml')
        self.layout = BoxLayout(orientation="vertical")
        self.keyboard = None  # created in build()
        self.control_panel = ControlPanel(self.start_timer,
                                          self.stop_ev,
                                          self.change_difficulty)
        # Scheduling handles; nothing is scheduled until a round starts.
        self.timer = self.stop_event = self.tick_event = None
        self.ticks = MAX_TIME
        self.screen = Screen()
        self.difficulty = "Easy"
        super(TouchKeyboard, self).__init__()

    def build(self):
        """Assemble the widget tree and return the root layout."""
        root = self.layout
        root.add_widget(self.control_panel)
        root.add_widget(self.screen)
        self.screen.difficulty.text = self.difficulty
        self.keyboard = Keyboard(self.screen.output, self.key_callback)
        root.add_widget(self.keyboard)
        return root

    def reshuffle_keys(self, instance):
        """Regenerate the keyboard; used as the periodic timer callback."""
        self.keyboard.generate_keyboard()

    def start_timer(self, instance: Button):
        """Start a round: lock the controls, show a new word and schedule
        the reshuffle, tick and stop events."""
        self.control_panel.lock_controls()
        screen = self.screen
        screen.output.text = ""
        screen.input.text = self.words_generator.generate_word(
            self.difficulty)
        self.timer = Timer.start(self.reshuffle_keys, CHANGE_TIME)
        self.tick_event = Timer.start(self.decrease_tick, 1)
        self.stop_event = Clock.schedule_once(self.stop_ev, MAX_TIME)

    def stop_ev(self, *largs):
        """End the round: unschedule all events and reset the display."""
        for scheduled in (self.timer, self.stop_event, self.tick_event):
            Clock.unschedule(scheduled)
        self.ticks = MAX_TIME
        self.control_panel.unlock_controls()
        screen = self.screen
        screen.output.text = ""
        screen.time.text = str(self.ticks)
        screen.input.text = ""

    def decrease_tick(self, *largs):
        """Count down one tick and show the remaining ticks."""
        self.ticks -= 1
        self.screen.time.text = str(self.ticks)

    def key_callback(self, *largs):
        """Finish the round with a win once the output equals the target."""
        if self.screen.output.text != self.screen.input.text:
            return
        self.stop_ev()
        self.show_win()

    def show_win(self):
        """Open a popup telling the player that they won."""
        message = Label(text='You won', font_size='40sp')
        Popup(title='Winner',
              content=message,
              size_hint=(None, None),
              size=(400, 400)).open()

    def change_difficulty(self, *largs):
        """Cycle the difficulty Easy -> Medium -> Hard -> Easy and show it."""
        # Any value other than 'Easy' or 'Medium' wraps back to 'Easy'.
        next_level = {'Easy': 'Medium', 'Medium': 'Hard'}
        self.difficulty = next_level.get(self.difficulty, 'Easy')
        self.screen.difficulty.text = self.difficulty
def getfile(filename):
    """Build a response that sends a generated file as an attachment.

    Args:
        filename: Name passed to `WordGenerator.get_file_path` and echoed
            in the Content-Disposition header.

    Returns:
        The response wrapping the file, with the attachment header set.
    """
    # NOTE(review): if `filename` comes straight from the request, confirm
    # that get_file_path guards against path traversal.
    file_path = WordGenerator.get_file_path(filename)
    response = make_response(send_file(file_path))

    disposition = 'attachment;filename={};'.format(filename)
    response.headers['Content-Disposition'] = disposition.encode('utf-8')
    return response
def play(level=1):
    """Print the hangman banner, pick a word and submit one guessed letter.

    Args:
        level: Currently unused; kept so existing callers keep working.

    A blank input line used to crash with IndexError on `input()[0]`; the
    prompt is now repeated until at least one character is entered.
    """
    print("##########################")
    print("####Let's play hangman####")
    print("##########################")
    WordGenerator.get_word()

    guess = input()
    while not guess:
        # Nothing was typed: ask again instead of indexing an empty string.
        guess = input()
    WordGenerator.guess_char(guess[0])
import pickle
import re

from word_generator import WordGenerator

# Train a word model on a list of first names, pickle it and print a few
# sample generations.
model = WordGenerator()

# Read the raw names; the context manager closes the file handle promptly.
with open("first_names.all.txt", "r") as names_file:
    word_list = names_file.read()

# Drop every character outside the allowed set, then split into one entry
# per line.
word_list = re.sub(r"[^a-zA-Z0-9@#$%\^\\/&\*\(\):;\?!'\-\n]", "", word_list).split("\n")
model.train(word_list, 3, 2)

# Persist the trained model; the file is flushed and closed on exit.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

for approx_length in range(3, 15):
    print("generated word of (approximate) length", approx_length, "is", model.generate(approx_length))
def main(args):
    """Authenticate against Twitter, build the WordGenerator and either
    tweet the WOTD or enter the bot loop.

    Reads from `args`: `verbose`, `log_file`, `device`,
    `forward_model_path`, `inverse_model_path`, `blacklist_path`,
    `quantize`, `wotd_mode` and `bootstrap`.

    Raises:
        RuntimeError: When one of the TWITTER_* environment variables is
            unset or empty, or when the last processed id cannot be
            determined and `args.bootstrap` is not set.
    """
    # Checked in this order, so the first missing variable is reported.
    required_vars = (
        "TWITTER_API_KEY",
        "TWITTER_API_SECRET",
        "TWITTER_ACCESS_TOKEN",
        "TWITTER_ACCESS_SECRET",
        "TWITTER_APP_NAME",
    )
    env = {name: os.environ.get(name) for name in required_vars}
    for name in required_vars:
        if not env[name]:
            raise RuntimeError(f"Missing {name} environment variable")
    app_name = env["TWITTER_APP_NAME"]

    # Remove all handlers associated with the root logger object so the
    # basicConfig call below takes effect.
    for existing_handler in list(logging.root.handlers):
        logging.root.removeHandler(existing_handler)

    log_options = {"level": logging.DEBUG if args.verbose else logging.INFO}
    if args.log_file:
        print(f"Logging to {args.log_file}")
        log_options.update(
            filename=args.log_file,
            filemode="a",
            format="%(asctime)s - %(levelname)s - %(message)s",
        )
    logging.basicConfig(**log_options)

    auth = tweepy.OAuthHandler(env["TWITTER_API_KEY"],
                               env["TWITTER_API_SECRET"])
    auth.set_access_token(env["TWITTER_ACCESS_TOKEN"],
                          env["TWITTER_ACCESS_SECRET"])
    api = tweepy.API(
        auth,
        wait_on_rate_limit=True,
        wait_on_rate_limit_notify=True,
        retry_count=5,
        retry_delay=30,
        retry_errors={500, 503},
    )

    word_generator = WordGenerator(
        device=args.device,
        forward_model_path=args.forward_model_path,
        inverse_model_path=args.inverse_model_path,
        blacklist_path=args.blacklist_path,
        quantize=args.quantize,
    )

    me = api.me()

    if args.wotd_mode:
        logger.info("Tweeting WOTD")
        tweet_wotd(me, api, word_generator)
        return

    try:
        last_processed_id = _fetch_last_processed_id(api, app_name)
    except StopIteration:
        # No previous reply was found; only continue when bootstrapping.
        if not args.bootstrap:
            raise RuntimeError("Unable to determine last replied to id")
        logger.warning("Bootstrapping bot with no last replied to id")
        last_processed_id = None

    logger.info(f"Entering bot loop - starting from {last_processed_id}")
    bot_loop(me, api, word_generator, last_processed_id)
#! /usr/bin/env python
import string

try:
    import cPickle
except ImportError:
    # Python 3 has no cPickle module; the plain pickle module replaces it.
    import pickle as cPickle

from word_generator import WordGenerator

#Generate and pickle a WordGenerator for fast loading later
ngrams = 3

# Read the system dictionary; the context manager closes the file handle.
with open("/usr/share/dict/words") as dict_file:
    stripped_words = [line.strip() for line in dict_file]

# Keep only words made up entirely of ASCII lowercase letters.
# NOTE(review): a blank line passes this filter as an empty string, exactly
# as before - confirm whether empty entries are wanted.
wordlist = [word for word in stripped_words
            if all(c in string.ascii_lowercase for c in word)]

w = WordGenerator(wordlist, ngrams)
w.finalize_probabilities()

# Write the pickle and make sure it is flushed and closed.
with open("word_gen{0}.pickle".format(ngrams), "wb") as pickle_file:
    cPickle.dump(w, pickle_file)
[ Word( word=w.word, definition=w.definition, pos=w.pos, topic=w.topic, example=clean_example(w.word, w.example), syllables=h_en.syllables(w.word), probably_exists=False, dataset_type=wordservice_pb2.DatasetType.UD_UNFILTERED, ) for w in words ] ) wi.dump_encrypted("../website/data/words_ud_unfiltered.enc.gz", fernet_key=os.environ.get("FERNET_ENCRYPTION_KEY")) wg = WordGenerator( device="cuda:0", forward_model_path="/mnt/evo/projects/title-maker-pro/models/urban_dictionary_250_cleaned_lr_00005_b9_seed4/checkpoint-140000", inverse_model_path=None, blacklist_path="/mnt/evo/projects/title-maker-pro/models/blacklist.pickle", quantize=False, is_urban=True, ) wg.generate_definition("cummy") from word_service.word_service_proto import wordservice_pb# model = AutoModelWithLMHead.from_pretrained("/mnt/evo/projects/title-maker-pro/models/urban_dictionary_250_cleaned_lr_00005_b9_seed4/checkpoint-140000").to("cuda:0")