def main():
    """
    Description: Main function

    Parses the command line, builds the word and font lists, creates the
    strings to render, generates one image per string in a worker pool and,
    when ``args.name_format`` is 2, writes a tab-separated ``labels.txt``
    (file name, text) into the output directory.
    """

    # Argument parsing
    args = parse_arguments()

    # Create the directory if it does not exist.
    try:
        os.makedirs(args.output_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    # Creating word list
    if args.dict:
        if not os.path.isfile(args.dict):
            sys.exit("Cannot open dict")
        with open(args.dict, "r", encoding="utf8", errors="ignore") as d:
            lang_dict = [l for l in d.read().splitlines() if len(l) > 0]
    else:
        lang_dict = load_dict(args.language)

    # The English word list is always read from this fixed relative path,
    # so the script has to be started from the directory that contains trdg/.
    with open('./trdg/dicts/en.txt', 'r', encoding='utf8', errors='ignore') as f:
        en_dict = [i for i in f.read().splitlines() if len(i) > 0]

    # Create font (path) list
    if args.font_dir:
        fonts = [
            os.path.join(args.font_dir, p)
            for p in os.listdir(args.font_dir)
            if os.path.splitext(p)[1] == ".ttf"
        ]
    elif args.font:
        if os.path.isfile(args.font):
            fonts = [args.font]
        else:
            sys.exit("Cannot open font")
    else:
        fonts = load_fonts(args.language)

    # Creating synthetic sentences (or word)
    strings = []

    if args.use_wikipedia:
        print('use_wikipedia')
        strings = create_strings_from_wikipedia(args.length, args.count, args.language)
    elif args.input_file != "":
        print('input_file')
        strings = create_strings_from_file(args.input_file, args.count)
    elif args.random_sequences:
        print('random_sequences')
        strings = create_strings_randomly(
            args.length,
            args.random,
            args.count,
            args.include_letters,
            args.include_numbers,
            args.include_symbols,
            args.language,
        )
        # Set a name format compatible with special characters automatically if they are used
        if args.include_symbols or True not in (
            args.include_letters,
            args.include_numbers,
            args.include_symbols,
        ):
            args.name_format = 2
    else:
        # Print the branch name like the other branches do (the original
        # printed the function object itself).
        print('make_my_strings')
        strings = make_my_strings(
            args.count,
            lang_dict,
            en_dict,
        )

    if args.language == "ar":
        from arabic_reshaper import ArabicReshaper

        arabic_reshaper = ArabicReshaper()
        # Reverse the word order of each string and reshape every word.
        strings = [
            " ".join([arabic_reshaper.reshape(w) for w in s.split(" ")[::-1]])
            for s in strings
        ]
    if args.case == "upper":
        strings = [x.upper() for x in strings]
    if args.case == "lower":
        strings = [x.lower() for x in strings]

    string_count = len(strings)

    # Settings that are identical for every image, in the positional order
    # they are passed after (index, text, font).
    per_image_constants = (
        args.output_dir,
        args.format,
        args.extension,
        args.skew_angle,
        args.random_skew,
        args.blur,
        args.random_blur,
        args.background,
        args.distorsion,
        args.distorsion_orientation,
        args.handwritten,
        args.name_format,
        args.width,
        args.alignment,
        args.text_color,
        args.orientation,
        args.space_width,
        args.character_spacing,
        args.margins,
        args.fit,
        args.output_mask,
        args.word_split,
        args.image_dir,
    )

    p = Pool(args.thread_count)
    try:
        for _ in tqdm(
            p.imap_unordered(
                FakeTextDataGenerator.generate_from_tuple,
                zip(
                    range(string_count),
                    strings,
                    [
                        fonts[rnd.randrange(0, len(fonts))]
                        for _ in range(string_count)
                    ],
                    *([value] * string_count for value in per_image_constants)
                ),
            ),
            # The number of jobs is the number of strings actually created,
            # which is what the progress bar has to count up to.
            total=string_count,
        ):
            pass
    finally:
        # Stop the workers even when a job raised.
        p.terminate()

    if args.name_format == 2:
        # Create file with filename-to-label connections
        with open(os.path.join(args.output_dir, "labels.txt"), "w", encoding="utf8") as f:
            for i, text in enumerate(strings):
                file_name = str(i) + "." + args.extension
                f.write("{}\t{}\n".format(file_name, text))
def __getitem__(self, index):
    """Generate one random text-line sample on the fly.

    ``index`` is not used: every call draws fresh random text and fresh
    rendering parameters. ``self.strings_int`` and ``self.strings_len`` are
    overwritten with the encoding of the newly drawn text.

    Returns:
        tuple: ``(X, y, x_len, y_len)`` where ``X`` is the rendered image
        divided by 255 with a leading axis of size 1, ``y`` is the list of
        character codes of the text (67 for characters missing from
        ``self.dictionary``), ``x_len`` is ``X.shape[2]`` and ``y_len`` is
        ``len(y)``.
    """
    # We are actually only getting one item here, obviously!
    num_words = randint(5, 6)
    skew_angle = 0
    random_skew = True
    blur = 0
    random_blur = True
    background = randint(0, 1)
    distorsion = randint(0, 2)
    distorsion_orientation = randint(0, 2)

    width = -1
    alignment = random.randint(0, 2)

    strings = create_strings_randomly(num_words, False, 1, True, True, True, "en")
    # Keep at most num_words tokens of every generated string.
    strings = [" ".join(word_tokenize(x)[:num_words]) for x in strings]

    # Encode character by character. dict.get replaces the per-character
    # scan of the self.keys list; this relies on self.dictionary being keyed
    # by exactly the entries of self.keys, as __init__ builds it.
    self.strings_int = [
        [self.dictionary.get(ch.lower(), 67) for ch in text] for text in strings
    ]
    self.strings_len = [len(x) for x in self.strings_int]
    string_count = len(strings)

    # What we do here is we space the words quite far apart.
    if random.random() > 0.8:
        width_scale = random.random() * 900
        # Guard against an empty string so the division cannot blow up.
        space_width = width_scale / max(np.sum(self.strings_len), 1)
    else:
        space_width = 2

    if random.random() > 0.85 and np.max(self.strings_len) < 30:
        width = random.randint(500, 800)

    image_list = [
        np.expand_dims(np.array(FakeTextDataGenerator.generate(*j)), 0)
        for j in zip(
            range(string_count),
            strings,
            [self.fonts[random.randrange(0, len(self.fonts))] for _ in range(string_count)],
            [self.format] * string_count,
            [self.extension] * string_count,
            [skew_angle] * string_count,
            [random_skew] * string_count,
            [blur] * string_count,
            [random_blur] * string_count,
            [background] * string_count,
            [distorsion] * string_count,
            [distorsion_orientation] * string_count,
            [self.handwritten] * string_count,
            [self.name_format] * string_count,
            [width] * string_count,
            [alignment] * string_count,
            [self.text_color] * string_count,
            [self.orientation] * string_count,
            [space_width] * string_count,
        )
    ]

    # Only the first (and, with a count of 1, presumably only) sample is used.
    X = image_list[0]
    y = self.strings_int[0]
    y_len = len(y)

    # Here we include some random horizontal lines because they appear quite
    # often in real life.
    if random.random() > 0.8:
        img_height = X.shape[1]
        img_width = X.shape[2]
        for _ in range(random.randint(0, 3)):
            # assumes the image has at least 3 channels - TODO confirm
            random_channel = random.randint(0, 2)
            # Pick the row from the real image height instead of a fixed
            # 0..31 range, so heights other than 32 are handled as well.
            random_h = random.randint(0, img_height - 1)
            random_w_s = random.randint(0, int(img_width / 2))
            random_w_e = random.randint(int(img_width / 2), int(img_width))
            X[0, random_h, random_w_s:random_w_e, random_channel] = random.randint(0, 255)

    if self.transform:
        X = self.seq.augment_images(X)

    X = X / 255
    x_len = X.shape[2]

    return X, y, x_len, y_len
def main():
    """
    Description: Main function

    Parses the command line, creates the strings to render, generates one
    image per string in a worker pool and, when ``args.name_format`` is 2,
    writes ``labels.txt`` (file name, space, text) into the output directory.
    """

    # Argument parsing
    args = parse_arguments()

    # Create the directory if it does not exist.
    try:
        os.makedirs(args.output_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    # Creating word list
    lang_dict = load_dict(args.language)

    # Create font (path) list
    fonts = load_fonts(args.language)

    # Creating synthetic sentences (or word)
    strings = []

    if args.use_wikipedia:
        strings = create_strings_from_wikipedia(args.length, args.count, args.language)
    elif args.input_file != '':
        strings = create_strings_from_file(args.input_file, args.count)
    elif args.random_sequences:
        strings = create_strings_randomly(args.length, args.random, args.count,
                                          args.include_letters, args.include_numbers,
                                          args.include_symbols, args.language)
        # Set a name format compatible with special characters automatically if they are used
        if args.include_symbols or True not in (args.include_letters, args.include_numbers,
                                                args.include_symbols):
            args.name_format = 2
    else:
        strings = create_strings_from_dict(args.length, args.random, args.count, lang_dict)

    string_count = len(strings)

    p = Pool(args.thread_count)
    try:
        for _ in tqdm(p.imap_unordered(
            FakeTextDataGenerator.generate_from_tuple,
            zip(
                range(string_count),
                strings,
                [fonts[random.randrange(0, len(fonts))] for _ in range(string_count)],
                [args.output_dir] * string_count,
                [args.format] * string_count,
                [args.extension] * string_count,
                [args.skew_angle] * string_count,
                [args.random_skew] * string_count,
                [args.blur] * string_count,
                [args.random_blur] * string_count,
                [args.background] * string_count,
                [args.distorsion] * string_count,
                [args.distorsion_orientation] * string_count,
                [args.handwritten] * string_count,
                [args.name_format] * string_count,
            )
        ), total=string_count):  # one tick per string actually created
            pass
    finally:
        # Stop the workers even when a job raised.
        p.terminate()

    if args.name_format == 2:
        # Create file with filename-to-label connections
        with open(os.path.join(args.output_dir, "labels.txt"), 'w', encoding="utf8") as f:
            for i, text in enumerate(strings):
                file_name = str(i) + "." + args.extension
                f.write("{} {}\n".format(file_name, text))
def main():
    """
    Description: Main function

    For every ``*captions.txt`` file found under ``<args.input_file>/es/``,
    creates strings, renders one image per string into the directory of that
    caption file and, when ``args.name_format`` is 2, writes ``labels.txt``
    there. Directories that already hold exactly 100 PNG files are skipped.
    """

    # Argument parsing
    args = parse_arguments()

    # Create the directory if it does not exist.
    try:
        os.makedirs(args.output_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    # Creating word list
    lang_dict = load_dict(args.language)

    # Create font (path) list
    fonts = load_fonts(args.language)

    # Text colours drawn per image with weights 0.9 / 0.1 further below.
    text_colors = [(255, 255, 255, 255), (255, 255, 0, 255)]

    # NOTE(review): machine-specific absolute path; on any other machine the
    # background list is empty and sampling a background fails.
    background_root = '/Users/yaqi.wang/projects/burned_in_captions/data/captions/background'
    bg_paths = glob.glob(background_root + '/*/*.png')
    bg_paths += glob.glob(background_root + '/*/*.bmp')
    print("len(bg_paths):", len(bg_paths))

    # Creating synthetic sentences (or word)
    strings = []

    caption_file_names = glob.glob(args.input_file + '/es/*captions.txt')
    for i_file, caption_file_name in enumerate(caption_file_names):
        print("pricessing {}th file".format(i_file))

        out_path = os.path.dirname(caption_file_name)
        # A directory with exactly 100 PNGs is treated as already done.
        if len(glob.glob(out_path + '/*.png')) == 100:
            print("pass!")
            continue

        if args.use_wikipedia:
            strings = create_strings_from_wikipedia(args.length, args.count, args.language)
        elif args.input_file != '':
            strings = create_strings_from_file(caption_file_name, args.count)
        elif args.random_sequences:
            strings = create_strings_randomly(args.length, args.random, args.count,
                                              args.include_letters, args.include_numbers,
                                              args.include_symbols, args.language)
            # Set a name format compatible with special characters automatically if they are used
            if args.include_symbols or True not in (args.include_letters, args.include_numbers,
                                                    args.include_symbols):
                args.name_format = 2
        else:
            print("create_strings_from_dict!")
            strings = create_strings_from_dict(args.length, args.random, args.count, lang_dict)

        string_count = len(strings)

        sampled_image_paths = [random.choice(bg_paths) for _ in range(string_count)]

        p = Pool(args.thread_count)
        try:
            for _ in tqdm(p.imap_unordered(
                FakeTextDataGenerator.generate_from_tuple,
                zip(
                    range(string_count),
                    strings,
                    # Fonts are assigned round-robin instead of at random.
                    [fonts[i % len(fonts)] for i in range(string_count)],
                    [out_path] * string_count,
                    [args.format] * string_count,
                    [args.extension] * string_count,
                    [args.skew_angle] * string_count,
                    [args.random_skew] * string_count,
                    [args.blur] * string_count,
                    [args.random_blur] * string_count,
                    [args.background] * string_count,
                    [args.distorsion] * string_count,
                    [args.distorsion_orientation] * string_count,
                    [args.handwritten] * string_count,
                    [args.name_format] * string_count,
                    # Sits where the original passed args.width: a non-negative
                    # integer drawn around args.margin (std-dev 2).
                    [int(math.fabs(random.gauss(args.margin, 2))) for _ in range(string_count)],
                    # Sits where the original passed args.alignment.
                    [random_pick([0, 1, 2], [0.2, 0.6, 0.2]) for _ in range(string_count)],
                    # Sits where the original passed args.text_color.
                    [random_pick(text_colors, [0.9, .1]) for _ in range(string_count)],
                    # presumably the contour width - verify against generate_from_tuple
                    [random_pick([0, 1, 2], [0.2, 0.7, 0.1]) for _ in range(string_count)],
                    sampled_image_paths
                )
            ), total=string_count):  # one tick per string actually created
                pass
        finally:
            # Stop this file's workers even when a job raised, so pools do
            # not pile up across caption files.
            p.terminate()

        if args.name_format == 2:
            # Create file with filename-to-label connections
            with open(os.path.join(out_path, "labels.txt"), 'w', encoding="utf8") as f:
                for i, text in enumerate(strings):
                    file_name = str(i) + "." + args.extension
                    f.write("{} {}\n".format(file_name, text))
def main():
    """
    Description: Main function

    Parses the command line, creates strings (and word labels where the
    source provides them), splits them into a train part and a test part,
    hands both parts to ``main_parallel`` and, when the name format is 2,
    writes ``labels.txt`` into the output directory.
    """

    # Argument parsing
    args_default = parse_arguments()

    # Create the directory if it does not exist.
    try:
        os.makedirs(args_default.output_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    if args_default.tf_flag == 1:
        try:
            os.makedirs(args_default.tfr_output_dir)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

    # Creating word list
    lang_dict = load_dict(args_default.language)
    print('lang_dict size -->' + str(len(lang_dict)))

    # Create font (path) list
    if not args_default.font:
        fonts = load_fonts(args_default.language)
    else:
        if os.path.isfile(args_default.font):
            fonts = [args_default.font]
        else:
            sys.exit("Cannot open font")

    # Creating synthetic sentences (or word)
    strings = []
    word_labels = []

    if args_default.use_wikipedia:
        strings = create_strings_from_wikipedia(args_default.length,
                                                args_default.count,
                                                args_default.language)
    elif args_default.input_file != '':
        strings, word_labels = create_strings_from_file(
            args_default.input_file, args_default.count, lang_dict)
    elif args_default.random_sequences:
        strings = create_strings_randomly(
            args_default.length, args_default.random, args_default.count,
            args_default.include_letters, args_default.include_numbers,
            args_default.include_symbols, args_default.language)
        # Set a name format compatible with special characters automatically if they are used
        if args_default.include_symbols or True not in (
                args_default.include_letters, args_default.include_numbers,
                args_default.include_symbols):
            args_default.name_format = 2
    else:
        strings, word_labels = create_strings_from_dict(
            args_default.length, args_default.random, args_default.count,
            lang_dict)

    string_count = len(strings)
    argses = argsesCreator(string_count, args_default)

    # Train/test split: 90% for training, but never more than te_uplimit
    # samples for testing.
    tr_portion = 0.9
    te_uplimit = 1000
    tr_st = 0
    tr_ed = max(int(string_count * tr_portion), string_count - te_uplimit)
    # Slices are half-open, so the test part starts exactly where the train
    # part ends; starting at tr_ed + 1 would drop sample tr_ed from both.
    te_st = tr_ed
    te_ed = string_count
    test_count = te_ed - te_st
    # Same truncation to two decimals as before; 0 when there is no data.
    test_percent = int(10000 * test_count / string_count) / 100 if string_count else 0.0

    print('TOTAL DATA GENERATE: ' + str(string_count))
    print('TRAIN DATA: ' + str(tr_ed))
    print('TEST DATA: ' + str(test_count) + ' (' + str(test_percent) + '%)')

    # Parallel mode.
    n_parallel = args_default.thread_count
    if n_parallel < 2:
        # Same split name as the parallel path (was misspelled 'tran').
        main_parallel(0, 1, fonts, argses[tr_st:tr_ed], strings[tr_st:tr_ed],
                      word_labels[tr_st:tr_ed], 'train')
    else:
        p = Pool(processes=n_parallel)
        # NOTE(review): the AsyncResult objects are discarded, so an
        # exception raised inside a worker is not reported here.
        for pid in range(n_parallel):
            p.apply_async(main_parallel, (
                pid,
                n_parallel,
                fonts,
                argses[tr_st:tr_ed],
                strings[tr_st:tr_ed],
                word_labels[tr_st:tr_ed],
                'train',
            ))
        p.close()
        p.join()

    # Generate test data.
    main_parallel(0, 1, fonts, argses[te_st:te_ed], strings[te_st:te_ed],
                  word_labels[te_st:te_ed], 'test')
    print('Done!')

    if args_default.name_format == 2:
        # Create file with filename-to-label connections
        with open(os.path.join(args_default.output_dir, "labels.txt"),
                  'w', encoding="utf8") as f:
            for i, text in enumerate(strings):
                file_name = str(i) + "." + args_default.extension
                f.write("{} {}\n".format(file_name, text))
def main():
    """
    Description: Main function

    Recreates the output directory, creates the strings to render, draws the
    per-image settings (background, text colour, font, distortion, blur,
    skew, font size), generates the images in a worker pool and then either
    writes ``labels.txt`` (name format 2) or builds a per-image report
    (name format 3).
    """

    # Argument parsing
    args = parse_arguments()

    # Start from an empty output directory: an existing one is deleted first.
    try:
        if os.path.exists(args.output_dir):
            shutil.rmtree(args.output_dir)
        os.makedirs(args.output_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    # Creating word list
    lang_dict = load_dict(args.language)

    # Create font (path) list
    if not args.font:
        fonts = load_fonts(args.language)
    else:
        if os.path.isfile(args.font):
            fonts = [args.font]
        else:
            sys.exit("Cannot open font")

    # Creating synthetic sentences (or word)
    strings = []

    if args.use_wikipedia:
        strings = create_strings_from_wikipedia(args.length, args.count, args.language)
    elif args.input_file != '':
        strings = create_strings_from_file(args.input_file, args.count)
    elif args.random_sequences:
        strings = create_strings_randomly(args.length, args.random, args.count,
                                          args.include_letters, args.include_numbers,
                                          args.include_symbols, args.language)
        # Set a name format compatible with special characters automatically if they are used
        if args.include_symbols or True not in (args.include_letters, args.include_numbers,
                                                args.include_symbols):
            args.name_format = 2
    else:
        strings = create_strings_from_dict(args.length, args.random, args.count, lang_dict)

    string_count = len(strings)

    # Random BG color
    colorBGList = []
    backgroundList = []
    if args.background in (0, 1, 2, 3):
        colorBGList = [args.background] * args.count
        backgroundList = [args.background] * args.count
    elif args.background == 4:
        # The helpers already return a whole list, so call them once instead
        # of once per image.
        if args.background_color_mode == "rndInList":
            colorBGList = RandomBackgroundColorInList(args.count)
            backgroundList = [4] * args.count
        elif args.background_color_mode == "rnd":
            colorBGList = RandomBackgroundColor(args.count)
            backgroundList = [4] * args.count
    elif args.background == 5:
        for _ in range(args.count):
            # Kept in a local so args.background is not overwritten.
            drawn_background = random.randint(0, 4)
            if drawn_background == 4:
                # NOTE(review): with any other colour mode nothing is
                # appended for this image, exactly as before.
                if args.background_color_mode == "rndInList":
                    colorBGList.append(random.choice(colorList))
                    backgroundList.append(4)
                elif args.background_color_mode == "rnd":
                    colorBGList.append(
                        (random.randint(0, 255), random.randint(0, 255),
                         random.randint(0, 255), 1))
                    backgroundList.append(4)
            else:
                # Already uniform over 0..3, no second draw needed.
                colorBGList.append(drawn_background)
                backgroundList.append(drawn_background)

    # Random text color
    if args.text_color == 'rndInList':
        colorTextList = RandomTextColorInList(args.count)
    elif args.text_color == 'rnd':
        colorTextList = RandomTextColor(args.count)
    else:
        colorTextList = [args.text_color] * args.count

    # Random font. random.choice can return every font, including the last
    # one, and also works when there is only a single font.
    fontList = [random.choice(fonts) for _ in range(args.count)]

    # Distorsion list
    if args.distorsion == 3:
        distorsionList = [random.randint(0, 2) for _ in range(args.count)]
    else:
        distorsionList = [args.distorsion] * args.count

    # Skew & Blur list
    blurList = []
    skewList = []
    if args.random_blur_and_skew:
        for _ in range(args.count):
            blurList.append(random.choice([True, False]))
            skewList.append(random.choice([True, False]))
    else:
        blurList = [args.random_blur] * args.count
        # The skew flag belongs in skewList; appending it to blurList left
        # skewList empty, and zip() then produced no jobs at all.
        skewList = [args.random_skew] * args.count

    # Font size List
    fontSizeList = []
    ratioList = []
    for _ in range(args.count):
        if args.random_font_size:
            ratio = random.choice(percentRatioList)
            fontSizeList.append(ratio)
            ratioList.append(int(ratio))
        else:
            fontSizeList.append(int(args.format))
            # Fixed size is reported as 100%.
            ratioList.append(100)

    p = Pool(args.thread_count)
    try:
        for _ in tqdm(p.imap_unordered(
            FakeTextDataGenerator.generate_from_tuple,
            zip(
                range(string_count),
                strings,
                fontList,
                [args.output_dir] * string_count,
                [args.format] * string_count,
                [args.extension] * string_count,
                [args.skew_angle] * string_count,
                skewList,
                [args.blur] * string_count,
                blurList,
                backgroundList,
                distorsionList,
                [args.distorsion_orientation] * string_count,
                [args.handwritten] * string_count,
                [args.name_format] * string_count,
                [args.width] * string_count,
                [args.alignment] * string_count,
                # One colour per image; repeating the list is not needed.
                colorTextList,
                [args.orientation] * string_count,
                [args.space_width] * string_count,
                [args.margins] * string_count,
                [args.fit] * string_count,
                colorBGList,
                fontSizeList,
            )
        ), total=string_count):  # one tick per string actually created
            pass
    finally:
        # Stop the workers even when a job raised.
        p.terminate()

    if args.name_format == 2:
        # Create file with filename-to-label connections
        with open(os.path.join(args.output_dir, "labels.txt"), 'w', encoding="utf-8") as f:
            for i, text in enumerate(strings):
                file_name = str(i) + "." + args.extension
                f.write("{} {}\n".format(file_name, text))
                print(text)
    elif args.name_format == 3:
        from PIL import Image

        dataframe = []
        # Never index past the strings that were really created.
        for i in range(min(args.count, string_count)):
            # os.path.join also works when output_dir has no trailing
            # separator; the handle is closed once the size has been read.
            # NOTE(review): the '.jpg' suffix is fixed here regardless of
            # args.extension - confirm against the generator's naming.
            with Image.open(os.path.join(args.output_dir, str(i) + '.jpg')) as im:
                image_size = im.size
            fontName = fontList[i]

            background_label = colorBGList[i]
            if background_label == 0:
                background_label = "gaussian noise"
            elif background_label == 1:
                background_label = "plain white"
            elif background_label == 2:
                background_label = "quasicrystal"
            elif background_label == 3:
                background_label = "picture"
            elif isinstance(background_label, tuple):
                background_label = "color background " + str(background_label)

            distorsion_label = distorsionList[i]
            if distorsion_label == 0:
                distorsion_label = "None (Default)"
            elif distorsion_label == 1:
                distorsion_label = "Sine wave"
            elif distorsion_label == 2:
                distorsion_label = "Cosine wave"

            # fontName[10:] drops the first 10 characters of the font path
            # (presumably a fixed directory prefix - verify).
            tupleData = (i, strings[i], fontName[10:], str(ratioList[i]) + "%",
                         colorTextList[i], background_label, image_size,
                         distorsion_label, blurList[i], skewList[i])
            dataframe.append(tupleData)
            print(tupleData)
        CreateReport(dataframe)
def _make_output_dir(path):
    """
    Description: Create ``path`` if it does not exist yet.
    """
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise


def _render_images(args, fonts, strings, string_count):
    """
    Description: Render ``strings`` with the settings currently held in ``args``.

    Returns the values produced by ``FakeTextDataGenerator.generate_from_tuple``
    as a list, in the same order as ``strings``.
    """
    p = Pool(args.thread_count)
    try:
        # imap (not imap_unordered): result i has to belong to strings[i],
        # because the CSV export pairs them up by index.
        return list(tqdm(p.imap(
            FakeTextDataGenerator.generate_from_tuple,
            zip(
                range(string_count),
                strings,
                [fonts[rnd.randrange(0, len(fonts))] for _ in range(string_count)],
                [args.output_dir] * string_count,
                [args.format] * string_count,
                [args.extension] * string_count,
                [args.skew_angle] * string_count,
                [args.random_skew] * string_count,
                [args.blur] * string_count,
                [args.random_blur] * string_count,
                [args.background] * string_count,
                [args.distorsion] * string_count,
                [args.distorsion_orientation] * string_count,
                [args.handwritten] * string_count,
                [args.name_format] * string_count,
                [args.width] * string_count,
                [args.alignment] * string_count,
                [args.text_color] * string_count,
                [args.orientation] * string_count,
                [args.space_width] * string_count,
                [args.margins] * string_count,
                [args.fit] * string_count
            )
        ), total=string_count))
    finally:
        # Stop the workers even when a job raised.
        p.terminate()


def _write_csv_report(fileout, args, file_header, contents, rand_nums,
                      row_count, name_prefix, strokes=None):
    """
    Description: Write one CSV row per generated image to ``fileout``.

    Every row holds the text, the file name (prefixed with ``name_prefix``),
    optionally the stroke count, and the value of every argument in ``args``;
    'blur' is replaced by the blur value reported by the generator and
    'length' by the length of the text. Rows are sorted by the number in the
    file name.
    """
    header = list(vars(args))
    header.insert(0, "Content")
    header.insert(2, file_header)
    if strokes is not None:
        header.insert(3, "Num of strokes")

    keyed_rows = []
    for i in range(row_count):
        # presumably result[0] is the blur that was applied and result[2]
        # the file name - verify against generate_from_tuple
        result = rand_nums[i]
        row = []
        for arg in vars(args):
            if arg == 'blur':
                if args.random_blur is False:
                    result[0] = args.blur
                row.append(result[0])
            elif arg == 'length':
                row.append(len(contents[i]))
            else:
                row.append(getattr(args, arg))
        row.insert(0, contents[i])
        row.insert(2, name_prefix + result[2])
        if strokes is not None:
            row.insert(3, strokes[i])
        # Sort key: the file name without its extension, as a number.
        keyed_rows.append((int(os.path.splitext(result[2])[0]), row))

    keyed_rows.sort(key=lambda pair: pair[0])

    # newline='' keeps the csv module from writing blank lines on Windows.
    with open(fileout, 'w', encoding="utf-8_sig", newline='') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow(header)
        writer.writerows(row for _, row in keyed_rows)


def main():
    """
    Description: Main function

    Parses the command line, creates the strings to render, generates the
    images, optionally runs the interactive "NLP test" (target images, clean
    sentence images and random-sentence images that are then concatenated)
    and exports ``labels.txt`` (name format 2) plus one or two CSV reports.
    """

    # Argument parsing
    args = parse_arguments()

    # Create the directory if it does not exist.
    _make_output_dir(args.output_dir)

    # Creating word list
    lang_dict = load_dict(args.language)

    # Create font (path) list
    if not args.font:
        fonts = load_fonts(args.language)
    else:
        if os.path.isfile(args.font):
            fonts = [args.font]
        else:
            sys.exit("Cannot open font")

    # Creating synthetic sentences (or word)
    strings, targets = [], []
    random_sentence, orig = [], []
    # Filled only when "kanji,stroke" lines are parsed from an input file.
    stroke_array = None
    inp = 'n'

    if args.use_wikipedia:
        strings = create_strings_from_wikipedia(args.length, args.count, args.language)
        print("sourcing from random wiki page...", args.use_wikipedia)
        input("Press enter...")
    elif args.input_file != '':
        strings = create_strings_from_file(args.input_file, args.count)
        print("sourcing from input file: ", args.input_file)
        print(args.language)
        if args.language == 'cn':
            # Every input line is "<kanji>,<stroke count>".
            kanji_array, stroke_array = [], []
            for strobj in strings:
                kanji, stroke = strobj.split(",")
                kanji_array.append(kanji)
                stroke_array.append(stroke)
            strings = kanji_array[:]
            print("entered ", kanji_array[-1], strings[-1])
        inp = input("NLP test? (y/n) ")
        if inp == 'y':
            print('Starting NLP-test' + os.linesep + ' ** generating sentences...')
            strings, random_sentence, targets = generate_random_get_targets(strings)
            print(strings[0] + targets[0])
            print(random_sentence[0] + targets[0])
            print(' ** generating random sentences and retrieving targets...')
            input("Press enter to generate images...")
    elif args.random_sequences:
        strings = create_strings_randomly(args.length, args.random, args.count,
                                          args.include_letters, args.include_numbers,
                                          args.include_symbols, args.language)
        # Set a name format compatible with special characters automatically if they are used
        if args.include_symbols or True not in (args.include_letters, args.include_numbers,
                                                args.include_symbols):
            args.name_format = 2
    else:
        strings = create_strings_from_dict(args.length, args.random, args.count, lang_dict)

    if args.case == 'upper':
        strings = [x.upper() for x in strings]
    if args.case == 'lower':
        strings = [x.lower() for x in strings]

    string_count = len(strings)
    # Compare by value; identity comparison with a literal is unreliable.
    nlp_test = inp == 'y'

    # Apply effect onto targets first then clean args
    if nlp_test:
        orig = strings
        strings = targets
        args.output_dir = "out_target/"
        args.margins = (5, 0, 5, 5)
        args.name_format = 2
        _make_output_dir(args.output_dir)
        print(' ** applying effects onto targets...')

    # Store the values reported by the FakeTextDataGenerator class
    RandNums = _render_images(args, fonts, strings, string_count)

    if nlp_test:
        strings = orig
        print(' ** generating clean images for sentences without target...')
        args.output_dir = "out/"
        args.margins = (5, 5, 5, 0)
        _make_output_dir(args.output_dir)
        # store and reset effect variables at args
        orig_blur = args.blur
        orig_random_blur = args.random_blur
        args.blur = 0
        args.background = 1
        args.random_blur = False

        _render_images(args, fonts, strings, string_count)

        rand_sent = random_sentence[:]
        strings = random_sentence
        print(' ** generating clean images out of random sentences...')
        args.output_dir = "out_random/"
        _make_output_dir(args.output_dir)

        _render_images(args, fonts, strings, string_count)

        args.blur = orig_blur
        args.random_blur = orig_random_blur

        # concatenate images and go back to original string
        concatenate_images()
        for index in range(len(strings)):
            rand_sent[index] += targets[index]
            strings[index] = orig[index] + targets[index]

    if args.name_format == 2:
        # Create file with filename-to-label connections
        with open(os.path.join(args.output_dir, "labels.txt"), 'w', encoding="utf-8_sig") as f:
            for i in range(string_count):
                file_name = str(i) + "." + args.extension
                f.write("{} {}\n".format(file_name, strings[i]))

    # Write to Csv
    _write_csv_report('PyCsvExp.csv', args, "File name", strings, RandNums,
                      string_count, "concat_" if nlp_test else "", stroke_array)

    if nlp_test:
        # Write to Csv
        _write_csv_report('PyCsvExpRandom.csv', args, "file name", rand_sent, RandNums,
                          string_count, "concat_random_")
def __init__(self,
             batch_size,
             epoch_size=10,
             random_strings=True,
             num_words=5,
             transform=False,
             width=-1,
             alignment=1,
             height=32):
    """Initialization.

    Stores the generation settings, builds the character dictionary and
    pre-renders ``batch_size * epoch_size`` text images.

    Args:
        batch_size: Multiplied with ``epoch_size`` to get the number of
            strings generated up front.
        epoch_size: See ``batch_size``.
        random_strings: When equal to ``False`` the text is fetched with
            ``create_strings_from_wikipedia`` (falling back to random text
            on failure); otherwise random text is used.
        num_words: Maximum number of tokens kept per string.
        transform: Stored as ``self.transform``.
        width: Passed to the image generator as the width setting.
        alignment: Passed to the image generator as the alignment setting.
        height: Stored as ``self.format`` and passed to the image generator.
    """
    # General args:
    self.transform = transform
    self.random_strings = random_strings
    self.num_words = num_words
    # How much data we want to generate in a single epoch
    self.epoch_size = epoch_size
    self.batch_size = batch_size

    # Text gen specific stuff
    self.thread_count = 6
    self.language = "en"
    self.count = 32
    # If we want to use random seq, alternatively we use Wikipedia (needs internet)
    self.random_sequences = False
    self.include_letters = True
    self.include_numbers = True
    self.include_symbols = True
    # When source wikipedia, how many words we want to include
    self.length = 10
    # If we want to have variable word lengths ("length" is the maximum)
    self.random = True
    # The height of the image
    self.format = height
    # Skewing angle
    self.skew_angle = 0
    self.random_skew = True
    self.use_wikipedia = True
    self.blur = 0
    self.random_blur = True

    # Define what kind of background to use. 0: Gaussian Noise,
    # 1: Plain white, 2: Quasicrystal, 3: Pictures
    self.background = 1

    # Define a distorsion applied to the resulting image. 0: None (Default),
    # 1: Sine wave, 2: Cosine wave, 3: Random
    self.distorsion = 0

    # Define the distorsion's orientation. Only used if -d is specified.
    # 0: Vertical (Up and down), 1: Horizontal (Left and Right), 2: Both
    self.distorsion_orientation = 0
    self.width = width
    self.orientation = 0
    self.text_color = '#282828'
    self.space_width = 0.1
    self.extension = "jpg"
    self.handwritten = False
    self.name_format = 0
    self.alignment = alignment

    # Character set: every character gets a code starting at 1.
    pool = ''
    pool += "abcdefghijklmnopqrstuvwxyz"
    pool += "0123456789"
    pool += "!\"#$%&'()*+,-./:;?@[\\]^_`{|}~"
    pool += ' '
    self.keys = list(pool)
    self.values = np.array(range(1, len(pool) + 1))
    self.dictionary = dict(zip(self.keys, self.values))
    self.fonts = load_fonts("en")

    self.decode_dict = dict((v, k) for k, v in self.dictionary.items())
    # 67 is the code used for every character that is not in the pool.
    self.decode_dict.update({67: "OOK"})

    # Get strings. We try to load strings from wikipedia, if that fails we
    # fall back to random strings.
    strings = []
    if self.random_strings == False:  # noqa: E712 - keeps the original comparison
        try:
            # Num words does not seem to apply here; presumably one sentence
            # is returned per string - verify.
            strings = create_strings_from_wikipedia(
                1, self.batch_size * self.epoch_size, "en")
        except Exception:
            # Not a bare except, so Ctrl-C and SystemExit still get through.
            print("Connection issues")
            strings = create_strings_randomly(self.num_words, False,
                                              batch_size * epoch_size,
                                              True, True, True, "en")
    else:
        strings = create_strings_randomly(self.num_words, False,
                                          batch_size * epoch_size, True,
                                          True, True, "en")

    # Get images.
    # Here we actually take up to n words, by word tokenizing and keeping
    # the first n tokens.
    strings = [" ".join(word_tokenize(x)[:self.num_words]) for x in strings]

    # Convert every character to its integer code, 67 for symbols we do not
    # know. The dictionary is keyed by exactly the entries of self.keys, so
    # dict.get replaces the per-character scan of that list.
    self.strings_int = [
        [self.dictionary.get(ch.lower(), 67) for ch in text] for text in strings
    ]

    # Then we get the lengths, which we need for the loss
    self.strings_len = [len(x) for x in self.strings_int]
    string_count = len(strings)

    # Render every string once; each image gets a leading axis of size 1.
    self.image_list = [
        np.expand_dims(np.array(FakeTextDataGenerator.generate(*j)), 0)
        for j in zip(
            range(string_count),
            strings,
            [self.fonts[random.randrange(0, len(self.fonts))] for _ in range(string_count)],
            [self.format] * string_count,
            [self.extension] * string_count,
            [self.skew_angle] * string_count,
            [self.random_skew] * string_count,
            [self.blur] * string_count,
            [self.random_blur] * string_count,
            [self.background] * string_count,
            [self.distorsion] * string_count,
            [self.distorsion_orientation] * string_count,
            [self.handwritten] * string_count,
            [self.name_format] * string_count,
            [self.width] * string_count,
            [self.alignment] * string_count,
            [self.text_color] * string_count,
            [self.orientation] * string_count,
            [self.space_width] * string_count,
        )
    ]

    self.seq = augmentations