Beispiel #1
0
def main():
    """
    Description: Main function

    Parses the command-line arguments, prepares the output directory, the
    word list and the font list, builds the strings to render, renders them
    with a pool of worker processes and, when ``args.name_format == 2``,
    writes a tab-separated ``labels.txt`` (file name, text) into the output
    directory.
    """

    # Argument parsing
    args = parse_arguments()

    # Create the directory if it does not exist.
    os.makedirs(args.output_dir, exist_ok=True)

    # Creating word list
    if args.dict:
        if not os.path.isfile(args.dict):
            sys.exit("Cannot open dict")
        with open(args.dict, "r", encoding="utf8", errors="ignore") as d:
            lang_dict = [l for l in d.read().splitlines() if len(l) > 0]
    else:
        lang_dict = load_dict(args.language)

    # Create font (path) list
    if args.font_dir:
        # Compare the extension case-insensitively so "*.TTF" files count too.
        fonts = [
            os.path.join(args.font_dir, p) for p in os.listdir(args.font_dir)
            if os.path.splitext(p)[1].lower() == ".ttf"
        ]
    elif args.font:
        if os.path.isfile(args.font):
            fonts = [args.font]
        else:
            sys.exit("Cannot open font")
    else:
        fonts = load_fonts(args.language)

    # Creating synthetic sentences (or word)
    if args.use_wikipedia:
        print('use_wikipedia')
        strings = create_strings_from_wikipedia(args.length, args.count,
                                                args.language)
    elif args.input_file != "":
        print('input_file')
        strings = create_strings_from_file(args.input_file, args.count)
    elif args.random_sequences:
        print('random_sequences')
        strings = create_strings_randomly(
            args.length,
            args.random,
            args.count,
            args.include_letters,
            args.include_numbers,
            args.include_symbols,
            args.language,
        )
        # Set a name format compatible with special characters automatically if they are used
        if args.include_symbols or True not in (
                args.include_letters,
                args.include_numbers,
                args.include_symbols,
        ):
            args.name_format = 2
    else:
        # Print the mode name, like the other branches do (the function
        # object itself used to be printed here).
        print('make_my_strings')
        # The English word list is only needed by this mode, so it is read
        # here instead of unconditionally at start-up.
        with open('./trdg/dicts/en.txt', 'r', encoding='utf8',
                  errors='ignore') as f:
            en_dict = [i for i in f.read().splitlines() if len(i) > 0]
        strings = make_my_strings(
            args.count,
            lang_dict,
            en_dict,
        )

    if args.language == "ar":
        from arabic_reshaper import ArabicReshaper

        arabic_reshaper = ArabicReshaper()
        # Reshape every word and reverse the word order of each string.
        strings = [
            " ".join([arabic_reshaper.reshape(w) for w in s.split(" ")[::-1]])
            for s in strings
        ]
    if args.case == "upper":
        strings = [x.upper() for x in strings]
    if args.case == "lower":
        strings = [x.lower() for x in strings]

    string_count = len(strings)

    # Fail with a readable message instead of the ValueError that
    # randrange(0, 0) would raise below.
    if string_count and not fonts:
        sys.exit("No fonts found")

    # One tuple per image: (index, text, font, <shared options...>).
    jobs = zip(
        range(string_count),
        strings,
        [
            fonts[rnd.randrange(0, len(fonts))]
            for _ in range(0, string_count)
        ],
        [args.output_dir] * string_count,
        [args.format] * string_count,
        [args.extension] * string_count,
        [args.skew_angle] * string_count,
        [args.random_skew] * string_count,
        [args.blur] * string_count,
        [args.random_blur] * string_count,
        [args.background] * string_count,
        [args.distorsion] * string_count,
        [args.distorsion_orientation] * string_count,
        [args.handwritten] * string_count,
        [args.name_format] * string_count,
        [args.width] * string_count,
        [args.alignment] * string_count,
        [args.text_color] * string_count,
        [args.orientation] * string_count,
        [args.space_width] * string_count,
        [args.character_spacing] * string_count,
        [args.margins] * string_count,
        [args.fit] * string_count,
        [args.output_mask] * string_count,
        [args.word_split] * string_count,
        [args.image_dir] * string_count,
    )

    p = Pool(args.thread_count)
    try:
        # The progress total is the number of jobs actually submitted.
        for _ in tqdm(
                p.imap_unordered(FakeTextDataGenerator.generate_from_tuple,
                                 jobs),
                total=string_count,
        ):
            pass
    finally:
        # Stop the workers even when rendering raised.
        p.terminate()

    if args.name_format == 2:
        # Create file with filename-to-label connections
        with open(os.path.join(args.output_dir, "labels.txt"),
                  "w",
                  encoding="utf8") as f:
            for i, text in enumerate(strings):
                file_name = str(i) + "." + args.extension
                f.write("{}\t{}\n".format(file_name, text))
Beispiel #2
0
    def __getitem__(self, index):
        """
        Generate one random text image together with its encoded label.

        ``index`` is not used: every call draws a fresh random string with
        ``create_strings_randomly`` and renders it with
        ``FakeTextDataGenerator.generate``. As a side effect the encoded
        strings and their lengths are stored on ``self.strings_int`` and
        ``self.strings_len``.

        Returns:
            tuple: ``(X, y, x_len, y_len)`` where ``X`` is the rendered image
            array (with a leading axis of size 1) divided by 255, ``y`` is
            the list of integer codes of the characters of the text,
            ``x_len`` is ``X.shape[2]`` and ``y_len`` is ``len(y)``.
        """
        # Rendering parameters. The random draws are kept in the same order
        # as before so that seeded runs stay comparable.
        num_words = randint(5, 6)
        skew_angle = 0
        random_skew = True
        blur = 0
        random_blur = True
        background = randint(0, 1)
        distorsion = randint(0, 2)
        distorsion_orientation = randint(0, 2)

        # -1 is the default width; it may be replaced by a random one below.
        width = -1
        alignment = random.randint(0, 2)

        strings = create_strings_randomly(num_words, False, 1,
                                          True, True, True, "en")

        # Keep at most num_words tokens of every generated string.
        strings = [" ".join(word_tokenize(x)[:num_words]) for x in strings]

        # Encode character by character; characters missing from self.keys
        # are mapped to the code 67.
        self.strings_int = [
            [self.dictionary[c.lower()] if c.lower() in self.keys else 67
             for c in text]
            for text in strings
        ]
        self.strings_len = [len(x) for x in self.strings_int]
        string_count = len(strings)

        # What we do here is we space the words quite far apart.
        if random.random() > 0.8:
            width_scale = random.random() * 900
            space_width = width_scale / np.sum(self.strings_len)
        else:
            space_width = 2

            if random.random() > 0.85 and np.max(self.strings_len) < 30:
                width = random.randint(500, 800)

        image_list = [np.expand_dims(np.array(FakeTextDataGenerator.generate(*j)), 0) for j in zip(
            range(string_count),
            strings,
            [self.fonts[random.randrange(0, len(self.fonts))] for _ in range(0, string_count)],
            [self.format] * string_count,
            [self.extension] * string_count,
            [skew_angle] * string_count,
            [random_skew] * string_count,
            [blur] * string_count,
            [random_blur] * string_count,
            [background] * string_count,
            [distorsion] * string_count,
            [distorsion_orientation] * string_count,
            [self.handwritten] * string_count,
            [self.name_format] * string_count,
            [width] * string_count,
            [alignment] * string_count,
            [self.text_color] * string_count,
            [self.orientation] * string_count,
            [space_width] * string_count)]

        # Only the first generated sample is returned.
        X = image_list[0]
        y = self.strings_int[0]

        y_len = len(y)

        # Here we include some random horizontal lines cause they appear quite often in real life.
        if random.random() > 0.8:
            # Rows 0..31 were hard-coded; clamp to the rows the array really
            # has so that shorter images do not raise an IndexError.
            max_row = min(31, X.shape[1] - 1)
            half_w = int(X.shape[2] / 2)
            for _ in range(random.randint(0, 3)):
                random_channel = random.randint(0, 2)
                random_h = random.randint(0, max_row)
                # The line starts in the left half and ends in the right half.
                random_w_s = random.randint(0, half_w)
                random_w_e = random.randint(half_w, int(X.shape[2]))

                X[0, random_h, random_w_s:random_w_e, random_channel] = random.randint(0, 255)

        if self.transform:
            X = self.seq.augment_images(X)

        X = X / 255
        x_len = X.shape[2]

        return X, y, x_len, y_len
Beispiel #3
0
def main():
    """
    Description: Main function

    Reads the command-line options, ensures the output directory exists,
    loads the dictionary and fonts for the requested language, builds the
    list of strings, renders one image per string in a process pool and,
    for ``name_format == 2``, writes ``labels.txt`` with one
    "<file name> <text>" line per image.
    """

    # Argument parsing
    args = parse_arguments()

    # Create the directory if it does not exist; an already existing one is fine.
    try:
        os.makedirs(args.output_dir)
    except OSError as err:
        if err.errno != errno.EEXIST:
            raise

    # Word list and font (path) list for the selected language
    lang_dict = load_dict(args.language)
    fonts = load_fonts(args.language)

    # Pick the source of the strings to render
    if args.use_wikipedia:
        strings = create_strings_from_wikipedia(args.length, args.count,
                                                args.language)
    elif args.input_file != '':
        strings = create_strings_from_file(args.input_file, args.count)
    elif args.random_sequences:
        strings = create_strings_randomly(args.length, args.random, args.count,
                                          args.include_letters,
                                          args.include_numbers,
                                          args.include_symbols, args.language)
        # Switch to the name format compatible with special characters when
        # symbols are used or when no character class was selected at all
        if args.include_symbols or True not in (args.include_letters,
                                                args.include_numbers,
                                                args.include_symbols):
            args.name_format = 2
    else:
        strings = create_strings_from_dict(args.length, args.random,
                                           args.count, lang_dict)

    string_count = len(strings)

    pool = Pool(args.thread_count)

    # One random font per string, drawn up front
    chosen_fonts = [
        fonts[random.randrange(0, len(fonts))] for _ in range(0, string_count)
    ]
    # Options that are identical for every image
    shared_options = (
        args.output_dir,
        args.format,
        args.extension,
        args.skew_angle,
        args.random_skew,
        args.blur,
        args.random_blur,
        args.background,
        args.distorsion,
        args.distorsion_orientation,
        args.handwritten,
        args.name_format,
    )
    jobs = [
        (idx, text, font) + shared_options
        for idx, (text, font) in enumerate(zip(strings, chosen_fonts))
    ]

    progress = tqdm(
        pool.imap_unordered(FakeTextDataGenerator.generate_from_tuple, jobs),
        total=args.count)
    for _ in progress:
        pass
    pool.terminate()

    if args.name_format == 2:
        # Create file with filename-to-label connections
        labels_path = os.path.join(args.output_dir, "labels.txt")
        with open(labels_path, 'w', encoding="utf8") as f:
            for idx in range(string_count):
                f.write("{} {}\n".format(
                    str(idx) + "." + args.extension, strings[idx]))
Beispiel #4
0
def main():
    """
    Description: Main function

    For every ``*captions.txt`` file found under ``<args.input_file>/es/``,
    builds a list of strings and renders one image per string into the
    directory of that caption file, using a fresh process pool per file.
    Caption directories that already contain exactly 100 ``.png`` files are
    skipped. For ``name_format == 2`` a ``labels.txt`` with one
    "<file name> <text>" line per image is written next to the images.

    NOTE(review): the background image locations are hard-coded absolute
    paths of one developer machine; they probably belong in an argument.
    """

    # Argument parsing
    args = parse_arguments()

    # Create the directory if it does not exist.
    os.makedirs(args.output_dir, exist_ok=True)

    # Creating word list
    lang_dict = load_dict(args.language)

    # Create font (path) list
    fonts = load_fonts(args.language)
    text_colors = [(255, 255, 255, 255),(255, 255, 0, 255)]

    bg_paths = glob.glob('/Users/yaqi.wang/projects/burned_in_captions/data/captions/background/*/*.png')
    bg_paths += glob.glob('/Users/yaqi.wang/projects/burned_in_captions/data/captions/background/*/*.bmp')
    print("len(bg_paths):",len(bg_paths))

    # Creating synthetic sentences (or word)
    strings = []
    caption_file_names = glob.glob(args.input_file+'/es/*captions.txt')
    for i_file,caption_file_name in enumerate(caption_file_names):
        print("pricessing {}th file".format(i_file))
        out_path = os.path.dirname(caption_file_name)
        # A directory that already holds exactly 100 PNG files is skipped.
        if len(glob.glob(out_path+'/*.png')) ==100:
            print("pass!")
            continue

        if args.use_wikipedia:
            strings = create_strings_from_wikipedia(args.length, args.count, args.language)
        elif args.input_file != '':
            strings = create_strings_from_file(caption_file_name, args.count)
        elif args.random_sequences:
            strings = create_strings_randomly(args.length, args.random, args.count,
                                              args.include_letters, args.include_numbers, args.include_symbols, args.language)
            # Set a name format compatible with special characters automatically if they are used
            if args.include_symbols or True not in (args.include_letters, args.include_numbers, args.include_symbols):
                args.name_format = 2
        else:
            print("create_strings_from_dict!")
            strings = create_strings_from_dict(args.length, args.random, args.count, lang_dict)

        string_count = len(strings)
        # One random background image per string.
        sampled_image_paths = [random.choice(bg_paths) for _ in range(0, string_count)]

        # One tuple per image; the per-image randomised entries replace the
        # fixed command-line values named in the comments.
        jobs = zip(
            range(string_count),
            strings,
            # Fonts are cycled in order instead of being drawn at random.
            [fonts[i%len(fonts)] for i in range(0, string_count)],
            [out_path] * string_count,
            [args.format] * string_count,
            [args.extension] * string_count,
            [args.skew_angle] * string_count,
            [args.random_skew] * string_count,
            [args.blur] * string_count,
            [args.random_blur] * string_count,
            [args.background] * string_count,
            [args.distorsion] * string_count,
            [args.distorsion_orientation] * string_count,
            [args.handwritten] * string_count,
            [args.name_format] * string_count,
            # Replaces args.width: |gauss(args.margin, 2)| truncated to int.
            [int(math.fabs(random.gauss(args.margin,2))) for _ in range(string_count)],
            # Replaces args.alignment: weighted pick among 0, 1, 2.
            [random_pick([0,1,2],[0.2,0.6,0.2]) for _ in range(string_count)],
            # Replaces args.text_color: mostly the first colour.
            [random_pick(text_colors,[0.9,.1]) for _ in range(string_count)],
            # NOTE(review): meaning of this slot is not visible here
            # (a trailing "contour_width" remark suggests a contour setting).
            [random_pick([0,1,2],[0.2,0.7,0.1]) for _ in range(string_count)],
            sampled_image_paths
        )

        p = Pool(args.thread_count)
        try:
            # The progress total is the number of jobs actually submitted.
            for _ in tqdm(p.imap_unordered(
                FakeTextDataGenerator.generate_from_tuple, jobs
            ), total=string_count):
                pass
        finally:
            # Stop this file's workers even when rendering raised, so a
            # failure does not leave worker processes behind.
            p.terminate()

        if args.name_format == 2:
            # Create file with filename-to-label connections
            with open(os.path.join(out_path, "labels.txt"), 'w', encoding="utf8") as f:
                for i, text in enumerate(strings):
                    file_name = str(i) + "." + args.extension
                    f.write("{} {}\n".format(file_name, text))
Beispiel #5
0
def main():
    """
    Description: Main function

    Parses the arguments, prepares the output directories, builds the list
    of strings (and, for the file and dictionary sources, their word
    labels), splits it into a train part and a test part and hands both to
    ``main_parallel``. The train part is processed in-process when
    ``thread_count < 2`` and by a process pool otherwise; the test part is
    always processed in-process. For ``name_format == 2`` a ``labels.txt``
    with one "<file name> <text>" line per string is written.

    The train part is the larger of 90% of the strings and "all but 1000";
    the test part is everything after it.
    """

    # Argument parsing
    args_default = parse_arguments()

    # Create the directory if it does not exist.
    os.makedirs(args_default.output_dir, exist_ok=True)

    if args_default.tf_flag == 1:
        os.makedirs(args_default.tfr_output_dir, exist_ok=True)

    # Creating word list
    lang_dict = load_dict(args_default.language)
    print('lang_dict size -->' + str(len(lang_dict)))

    # Create font (path) list
    if not args_default.font:
        fonts = load_fonts(args_default.language)
    elif os.path.isfile(args_default.font):
        fonts = [args_default.font]
    else:
        sys.exit("Cannot open font")

    # Creating synthetic sentences (or word)
    strings = []
    # Stays empty for the wikipedia and random-sequence sources.
    word_labels = []
    if args_default.use_wikipedia:
        strings = create_strings_from_wikipedia(args_default.length,
                                                args_default.count,
                                                args_default.language)
    elif args_default.input_file != '':
        strings, word_labels = create_strings_from_file(
            args_default.input_file, args_default.count, lang_dict)
    elif args_default.random_sequences:
        strings = create_strings_randomly(
            args_default.length, args_default.random, args_default.count,
            args_default.include_letters, args_default.include_numbers,
            args_default.include_symbols, args_default.language)
        # Set a name format compatible with special characters automatically if they are used
        if args_default.include_symbols or True not in (
                args_default.include_letters, args_default.include_numbers,
                args_default.include_symbols):
            args_default.name_format = 2
    else:
        strings, word_labels = create_strings_from_dict(
            args_default.length, args_default.random, args_default.count,
            lang_dict)

    string_count = len(strings)

    argses = argsesCreator(string_count, args_default)

    tr_portion = 0.9
    te_uplimit = 1000

    # Half-open index ranges: train is [tr_st, tr_ed), test is [te_st, te_ed).
    tr_st = 0
    tr_ed = max(int(string_count * tr_portion), string_count - te_uplimit)
    # The test range starts right where the train range ends; starting at
    # tr_ed + 1 left the item at index tr_ed in neither part.
    te_st = tr_ed
    te_ed = string_count

    test_count = te_ed - te_st
    # Percentage with two decimals; guarded against an empty string list.
    if string_count:
        test_percent = int(10000 * test_count / string_count) / 100
    else:
        test_percent = 0.0

    print('TOTAL DATA GENERATE: ' + str(string_count))
    print('TRAIN DATA: ' + str(tr_ed))
    print('TEST DATA: ' + str(test_count) + ' (' + str(test_percent) + '%)')

    # Parallel mode.
    n_parallel = args_default.thread_count
    if n_parallel < 2:
        # Same 'train' tag as the parallel branch (it used to be 'tran').
        main_parallel(0, 1, fonts, argses[tr_st:tr_ed], strings[tr_st:tr_ed],
                      word_labels[tr_st:tr_ed], 'train')
    else:
        p = Pool(processes=n_parallel)
        # Every worker receives the whole train slice plus its own id and
        # the worker count.
        for pid in range(0, n_parallel):
            p.apply_async(main_parallel, (
                pid,
                n_parallel,
                fonts,
                argses[tr_st:tr_ed],
                strings[tr_st:tr_ed],
                word_labels[tr_st:tr_ed],
                'train',
            ))
        p.close()
        p.join()

    # Generate test data.
    main_parallel(0, 1, fonts, argses[te_st:te_ed], strings[te_st:te_ed],
                  word_labels[te_st:te_ed], 'test')

    print('Done!')
    if args_default.name_format == 2:
        # Create file with filename-to-label connections
        with open(os.path.join(args_default.output_dir, "labels.txt"),
                  'w',
                  encoding="utf8") as f:
            for i, text in enumerate(strings):
                file_name = str(i) + "." + args_default.extension
                f.write("{} {}\n".format(file_name, text))
Beispiel #6
0
def main():
    """
    Description: Main function

    Parses the arguments, recreates the output directory from scratch,
    builds the strings to render and, for each of the ``args.count`` images,
    a per-image background, text colour, font, distorsion, blur/skew flag
    and font size. The images are rendered in a process pool. Afterwards,
    ``name_format == 2`` writes ``labels.txt`` and ``name_format == 3``
    collects one tuple per image and passes the list to ``CreateReport``.
    """

    # Argument parsing
    args = parse_arguments()

    # Start from an empty output directory: a previous one is deleted first.
    if os.path.exists(args.output_dir):
        shutil.rmtree(args.output_dir)
    os.makedirs(args.output_dir, exist_ok=True)

    # Creating word list
    lang_dict = load_dict(args.language)

    # Create font (path) list
    if not args.font:
        fonts = load_fonts(args.language)
    elif os.path.isfile(args.font):
        fonts = [args.font]
    else:
        sys.exit("Cannot open font")

    # Creating synthetic sentences (or word)
    if args.use_wikipedia:
        strings = create_strings_from_wikipedia(args.length, args.count,
                                                args.language)
    elif args.input_file != '':
        strings = create_strings_from_file(args.input_file, args.count)
    elif args.random_sequences:
        strings = create_strings_randomly(args.length, args.random, args.count,
                                          args.include_letters,
                                          args.include_numbers,
                                          args.include_symbols, args.language)
        # Set a name format compatible with special characters automatically if they are used
        if args.include_symbols or True not in (args.include_letters,
                                                args.include_numbers,
                                                args.include_symbols):
            args.name_format = 2
    else:
        strings = create_strings_from_dict(args.length, args.random,
                                           args.count, lang_dict)

    string_count = len(strings)

    # Per-image background: backgroundList holds the background mode that is
    # handed to the generator, colorBGList the matching colour (or the mode
    # number again for the non-colour modes 0-3).
    colorBGList = []
    backgroundList = []

    if args.background in (0, 1, 2, 3):
        colorBGList = [args.background] * args.count
        backgroundList = [args.background] * args.count
    elif args.background == 4:
        # The helpers already return a list for all images, so they are
        # called once instead of once per image.
        if args.background_color_mode == "rndInList":
            colorBGList = RandomBackgroundColorInList(args.count)
            backgroundList = [4] * args.count
        elif args.background_color_mode == "rnd":
            colorBGList = RandomBackgroundColor(args.count)
            backgroundList = [4] * args.count
    elif args.background == 5:
        # Mode 5: draw a background mode 0-4 independently for each image.
        for _ in range(args.count):
            picked = random.randint(0, 4)
            if picked == 4:
                if args.background_color_mode == "rndInList":
                    colorBGList.append(random.choice(colorList))
                    backgroundList.append(4)
                elif args.background_color_mode == "rnd":
                    colorBGList.append(
                        (random.randint(0, 255), random.randint(0, 255),
                         random.randint(0, 255), 1))
                    backgroundList.append(4)
            else:
                colorBGList.append(picked)
                backgroundList.append(picked)

    # Random text color
    if args.text_color == 'rndInList':
        colorTextList = RandomTextColorInList(args.count)
    elif args.text_color == 'rnd':
        colorTextList = RandomTextColor(args.count)
    else:
        colorTextList = [args.text_color] * args.count

    # Random font. random.choice can return every font, including the last
    # one, and also works when only a single font is available.
    fontList = [random.choice(fonts) for _ in range(args.count)]

    # Distorsion list: the value 3 means "random distorsion 0-2 per image".
    if args.distorsion == 3:
        distorsionList = [random.randint(0, 2) for _ in range(args.count)]
    else:
        distorsionList = [args.distorsion] * args.count

    # Skew & Blur list
    blurList = []
    skewList = []
    if args.random_blur_and_skew:
        for _ in range(args.count):
            blurList.append(random.choice([True, False]))
            skewList.append(random.choice([True, False]))
    else:
        # The skew flag goes into skewList; it used to be appended to
        # blurList, which left skewList empty so that no image was rendered.
        blurList = [args.random_blur] * args.count
        skewList = [args.random_skew] * args.count

    # Font size list and the ratio shown in the report.
    fontSizeList = []
    ratioList = []
    if args.random_font_size:
        for _ in range(args.count):
            ratio = random.choice(percentRatioList)
            fontSizeList.append(ratio)
            ratioList.append(int(ratio))
    else:
        # Fixed size: the ratio is always 100 (percent).
        fontSizeList = [int(args.format)] * args.count
        ratioList = [100] * args.count

    # One tuple per image. The per-image lists are passed as they are;
    # zip stops at the shortest argument.
    jobs = zip(
        range(string_count), strings,
        fontList, [args.output_dir] * string_count,
        [args.format] * string_count, [args.extension] * string_count,
        [args.skew_angle] * string_count, skewList,
        [args.blur] * string_count, blurList,
        backgroundList, distorsionList,
        [args.distorsion_orientation] * string_count,
        [args.handwritten] * string_count,
        [args.name_format] * string_count, [args.width] * string_count,
        [args.alignment] * string_count, colorTextList,
        [args.orientation] * string_count,
        [args.space_width] * string_count,
        [args.margins] * string_count, [args.fit] * string_count,
        colorBGList, fontSizeList)

    p = Pool(args.thread_count)
    try:
        for _ in tqdm(p.imap_unordered(
                FakeTextDataGenerator.generate_from_tuple, jobs),
                      total=args.count):
            pass
    finally:
        # Stop the workers even when rendering raised.
        p.terminate()

    if args.name_format == 2:
        # Create file with filename-to-label connections
        with open(os.path.join(args.output_dir, "labels.txt"),
                  'w',
                  encoding="utf-8") as f:
            for i, text in enumerate(strings):
                file_name = str(i) + "." + args.extension
                f.write("{} {}\n".format(file_name, text))
                print(text)

    elif args.name_format == 3:
        from PIL import Image
        dataframe = []
        for i in range(args.count):
            # Join the path properly (output_dir need not end with a
            # separator) and close the image once its size is read.
            with Image.open(os.path.join(args.output_dir,
                                         str(i) + '.jpg')) as im:
                image_size = im.size
            fontName = fontList[i]

            # Human-readable background description for the report.
            background_label = colorBGList[i]
            if background_label == 0:
                background_label = "gaussian noise"
            elif background_label == 1:
                background_label = "plain white"
            elif background_label == 2:
                background_label = "quasicrystal"
            elif background_label == 3:
                background_label = "picture"
            elif isinstance(background_label, tuple):
                background_label = "color background " + str(background_label)

            # Human-readable distorsion description for the report.
            distorsion_label = distorsionList[i]
            if distorsion_label == 0:
                distorsion_label = "None (Default)"
            elif distorsion_label == 1:
                distorsion_label = "Sine wave"
            elif distorsion_label == 2:
                distorsion_label = "Cosine wave"

            # fontName[10:] drops the first 10 characters of the font path
            # (presumably a fixed directory prefix - TODO confirm).
            tupleData = (i, strings[i], fontName[10:], str(ratioList[i]) + "%",
                         colorTextList[i], background_label, image_size,
                         distorsion_label, blurList[i], skewList[i])

            dataframe.append(tupleData)
            print(tupleData)

        CreateReport(dataframe)
Beispiel #7
0
def _render_images(args, strings, fonts, string_count):
    """Render one image per string using the settings currently held in *args*.

    Args:
        args: Parsed command-line namespace. It is read at call time, so the
            caller adjusts it (output_dir, margins, blur, ...) before each pass.
        strings: Texts to render.
        fonts: Candidate font paths; one is drawn at random for every job.
        string_count: Number of jobs to prepare. ``zip`` stops at the shorter
            of this and ``len(strings)``.

    Returns:
        A list with the value returned by
        ``FakeTextDataGenerator.generate_from_tuple`` for every job, in job
        order (``Pool.imap`` keeps submission order, unlike
        ``imap_unordered``), so entry ``i`` belongs to ``strings[i]``.
    """
    jobs = zip(
        range(string_count),
        strings,
        [fonts[rnd.randrange(0, len(fonts))] for _ in range(string_count)],
        [args.output_dir] * string_count,
        [args.format] * string_count,
        [args.extension] * string_count,
        [args.skew_angle] * string_count,
        [args.random_skew] * string_count,
        [args.blur] * string_count,
        [args.random_blur] * string_count,
        [args.background] * string_count,
        [args.distorsion] * string_count,
        [args.distorsion_orientation] * string_count,
        [args.handwritten] * string_count,
        [args.name_format] * string_count,
        [args.width] * string_count,
        [args.alignment] * string_count,
        [args.text_color] * string_count,
        [args.orientation] * string_count,
        [args.space_width] * string_count,
        [args.margins] * string_count,
        [args.fit] * string_count,
    )
    # Leaving the ``with`` block terminates the worker processes.
    with Pool(args.thread_count) as pool:
        return list(tqdm(
            pool.imap(FakeTextDataGenerator.generate_from_tuple, jobs),
            total=min(string_count, len(strings))))


def _write_report_csv(path, args, contents, rand_nums, string_count,
                      name_header, name_prefix="", strokes=None):
    """Write one CSV row per generated image to *path*, ordered by image index.

    Each row holds the rendered text, every attribute of *args* (with ``blur``
    replaced by the value actually used and ``length`` by the text length) and
    the image file name.

    Args:
        path: Destination CSV file.
        args: Parsed command-line namespace whose attributes become columns.
        contents: Text belonging to each image.
        rand_nums: Per-image generator results; index 0 is used as the blur
            value and index 2 as the file name.
        string_count: Number of rows to write.
        name_header: Column title for the file name column.
        name_prefix: Text prepended to every file name in the output.
        strokes: Optional per-image stroke counts; adds a "Num of strokes"
            column when given.

    Raises:
        ValueError: If a file name stem is not an integer index
            (rows are sorted by that index).
    """
    header = list(vars(args))
    header.insert(0, "Content")
    header.insert(2, name_header)
    if strokes is not None:
        header.insert(3, "Num of strokes")

    keyed_rows = []
    for i in range(string_count):
        file_name = rand_nums[i][2]
        row = []
        for arg in vars(args):
            if arg == 'blur':
                # With a fixed blur the configured value is reported,
                # otherwise the random value the generator handed back.
                if args.random_blur is False:
                    row.append(args.blur)
                else:
                    row.append(rand_nums[i][0])
            elif arg == 'length':
                row.append(len(contents[i]))
            else:
                row.append(getattr(args, arg))
        row.insert(0, contents[i])
        row.insert(2, name_prefix + file_name)
        if strokes is not None:
            row.insert(3, strokes[i])

        # Sort on the numeric stem of the raw file name; splitext works for
        # any extension, unlike stripping the characters of ".jpg".
        keyed_rows.append((int(os.path.splitext(file_name)[0]), row))

    keyed_rows.sort(key=lambda pair: pair[0])

    # newline='' is what the csv module expects; without it Windows output
    # gets an empty line after every row.
    with open(path, 'w', encoding="utf-8_sig", newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)
        writer.writerows(row for _, row in keyed_rows)


def main():
    """
        Description: Main function

        Parses the command line, builds the list of texts (Wikipedia, input
        file, random sequences or dictionary words), renders them to images
        in a process pool and writes ``labels.txt`` / ``PyCsvExp.csv``.

        When strings come from an input file the user may opt into the
        interactive "NLP test": targets are rendered with effects into
        ``out_target/``, the original and random sentences are rendered
        clean into ``out/`` and ``out_random/``, the images are concatenated
        and a second report ``PyCsvExpRandom.csv`` is written.
    """

    # Argument parsing
    args = parse_arguments()

    # Create the directory if it does not exist.
    os.makedirs(args.output_dir, exist_ok=True)

    # Creating word list
    lang_dict = load_dict(args.language)

    # Create font (path) list
    if not args.font:
        fonts = load_fonts(args.language)
    elif os.path.isfile(args.font):
        fonts = [args.font]
    else:
        sys.exit("Cannot open font")

    # Creating synthetic sentences (or word)
    strings, targets = [], []
    random_sentence, orig = [], []
    # Only filled for Chinese input files ("<kanji>,<strokes>" per line).
    stroke_counts = None
    nlp_test = False

    if args.use_wikipedia:
        strings = create_strings_from_wikipedia(args.length, args.count, args.language)
        print("sourcing from random wiki page...", args.use_wikipedia)
        input("Press enter...")
    elif args.input_file != '':
        strings = create_strings_from_file(args.input_file, args.count)
        print("sourcing from input file: ", args.input_file)

        print(args.language)
        if args.language == 'cn':
            kanji_list, stroke_counts = [], []
            for entry in strings:
                kanji, stroke = entry.split(",")
                kanji_list.append(kanji)
                stroke_counts.append(stroke)

            strings = kanji_list[:]
            if kanji_list:
                print("entered ", kanji_list[-1], strings[-1])

        # Compare by value: identity checks against a str literal only work
        # by accident of interpreter caching.
        nlp_test = input("NLP test? (y/n) ") == 'y'
        if nlp_test:
            print('Starting NLP-test' + os.linesep + '    ** generating sentences...')

            strings, random_sentence, targets = generate_random_get_targets(strings)

            if strings and random_sentence and targets:
                print(strings[0] + targets[0])
                print(random_sentence[0] + targets[0])
            print('    ** generating random sentences and retrieving targets...')

            input("Press enter to generate images...")

    elif args.random_sequences:
        strings = create_strings_randomly(args.length, args.random, args.count,
                                          args.include_letters, args.include_numbers, args.include_symbols,
                                          args.language)
        # Set a name format compatible with special characters automatically if they are used
        if args.include_symbols or True not in (args.include_letters, args.include_numbers, args.include_symbols):
            args.name_format = 2
    else:
        strings = create_strings_from_dict(args.length, args.random, args.count, lang_dict)

    if args.case == 'upper':
        strings = [x.upper() for x in strings]
    elif args.case == 'lower':
        strings = [x.lower() for x in strings]

    string_count = len(strings)

    # Apply effects onto the targets first, the clean passes follow below.
    if nlp_test:
        orig = strings
        strings = targets
        args.output_dir = "out_target/"
        args.margins = (5, 0, 5, 5)
        args.name_format = 2
        os.makedirs(args.output_dir, exist_ok=True)

        print('    ** applying effects onto targets...')

    # Values handed back by FakeTextDataGenerator for every image
    # (index 0: blur actually used, index 2: file name), in string order.
    rand_nums = _render_images(args, strings, fonts, string_count)

    if nlp_test:
        strings = orig
        print('    ** generating clean images for sentences without target...')
        args.output_dir = "out/"
        args.margins = (5, 5, 5, 0)
        os.makedirs(args.output_dir, exist_ok=True)

        # store and reset effect variables at args
        orig_blur = args.blur
        orig_random_blur = args.random_blur
        args.blur = 0
        args.background = 1
        args.random_blur = False

        _render_images(args, strings, fonts, string_count)

        rand_sent = random_sentence[:]
        strings = random_sentence
        print('    ** generating clean images out of random sentences...')
        args.output_dir = "out_random/"
        os.makedirs(args.output_dir, exist_ok=True)

        _render_images(args, strings, fonts, string_count)

        args.blur = orig_blur
        args.random_blur = orig_random_blur

        # concatenate images and go back to original string
        concatenate_images()
        for index in range(len(strings)):
            rand_sent[index] += targets[index]
            strings[index] = orig[index] + targets[index]

    if args.name_format == 2:
        # Create file with filename-to-label connections
        with open(os.path.join(args.output_dir, "labels.txt"), 'w', encoding="utf-8_sig") as f:
            for i in range(string_count):
                file_name = str(i) + "." + args.extension
                f.write("{} {}\n".format(file_name, strings[i]))

    # Write to Csv
    _write_report_csv(
        'PyCsvExp.csv', args, strings, rand_nums, string_count,
        name_header="File name",
        name_prefix="concat_" if nlp_test else "",
        strokes=stroke_counts)

    if nlp_test:
        _write_report_csv(
            'PyCsvExpRandom.csv', args, rand_sent, rand_nums, string_count,
            name_header="file name",
            name_prefix="concat_random_")
Beispiel #8
0
    def __init__(self,
                 batch_size,
                 epoch_size=10,
                 random_strings=True,
                 num_words=5,
                 transform=False,
                 width=-1,
                 alignment=1,
                 height=32):
        """Set up the generator options and pre-render all images.

        ``batch_size * epoch_size`` strings are created, truncated to at most
        ``num_words`` tokens, encoded as integer sequences and rendered with
        ``FakeTextDataGenerator.generate``.

        Args:
            batch_size: Number of samples per batch.
            epoch_size: Number of batches generated for one epoch.
            random_strings: When equal to ``False`` the strings are fetched
                from Wikipedia (falling back to random strings on failure);
                otherwise random strings are used.
            num_words: Maximum number of tokens kept per string.
            transform: Stored as ``self.transform``; not used in this method.
            width: Forwarded to the image generator.
            alignment: Forwarded to the image generator.
            height: Stored as ``self.format`` and forwarded to the generator.
        """
        # General args:
        self.transform = transform
        self.random_strings = random_strings
        self.num_words = num_words
        # How much data we want to generate in a single epoch
        self.epoch_size = epoch_size
        self.batch_size = batch_size

        def take_10(x):
            """Return at most ``self.num_words`` leading items of *x*."""
            if len(x) > self.num_words:
                return x[0:self.num_words]
            return x

        # Text generation specific settings
        self.thread_count = 6
        self.language = "en"
        self.count = 32
        # If we want to use random sequences; alternatively Wikipedia is used (needs internet)
        self.random_sequences = False
        self.include_letters = True
        self.include_numbers = True
        self.include_symbols = True
        # When sourcing from Wikipedia, how many words we want to include
        self.length = 10
        # If we want variable word lengths ("length" is then the maximum)
        self.random = True
        # The height of the image
        self.format = height
        # Skewing angle
        self.skew_angle = 0
        self.random_skew = True
        self.use_wikipedia = True
        self.blur = 0
        self.random_blur = True
        # Kind of background to use.
        # 0: Gaussian Noise, 1: Plain white, 2: Quasicrystal, 3: Pictures
        self.background = 1
        # Distorsion applied to the resulting image.
        # 0: None (Default), 1: Sine wave, 2: Cosine wave, 3: Random
        self.distorsion = 0
        # Orientation of the distorsion, only used if a distorsion is set.
        # 0: Vertical (Up and down), 1: Horizontal (Left and Right), 2: Both
        self.distorsion_orientation = 0
        self.width = width
        self.orientation = 0
        self.text_color = '#282828'
        self.space_width = 0.1
        self.extension = "jpg"
        self.handwritten = False
        self.name_format = 0
        self.alignment = alignment

        # Character vocabulary: every known character gets the code 1..len(pool)
        pool = ''
        pool += "abcdefghijklmnopqrstuvwxyz"
        pool += "0123456789"
        pool += "!\"#$%&'()*+,-./:;?@[\\]^_`{|}~"
        pool += ' '
        self.keys = list(pool)
        self.values = np.array(range(1, len(pool) + 1))
        self.dictionary = dict(zip(self.keys, self.values))
        self.fonts = load_fonts("en")

        self.decode_dict = dict((v, k) for k, v in self.dictionary.items())
        # 67 is the code reserved for characters outside the pool
        self.decode_dict[67] = "OOK"

        ### Get strings
        strings = []
        # We try to load strings from Wikipedia, if that fails we use random strings.
        # The explicit comparison is kept so that only False/0 selects Wikipedia.
        if self.random_strings == False:
            try:
                # num_words does not seem to apply here; one sentence is requested per string
                strings = create_strings_from_wikipedia(
                    1, self.batch_size * self.epoch_size, "en")
            except Exception:
                # Catching Exception (not everything) lets Ctrl-C and
                # SystemExit propagate instead of being treated as a
                # connection problem.
                print("Connection issues")
                strings = create_strings_randomly(self.num_words, False,
                                                  batch_size * epoch_size,
                                                  True, True, True, "en")
        else:
            strings = create_strings_randomly(self.num_words, False,
                                              batch_size * epoch_size, True,
                                              True, True, "en")

        ### Get Images
        # Keep up to num_words tokens per string by tokenizing and truncating
        strings = [" ".join(take_10(word_tokenize(x))) for x in strings]
        # Next we split into characters
        strings_ = [list(x) for x in strings]
        # Then we convert to integers, 67 for symbols we do not know.
        # The dict lookup is equivalent to testing membership in self.keys
        # but avoids scanning the list for every character.
        self.strings_int = [[
            self.dictionary.get(x.lower(), 67) for x in m
        ] for m in strings_]
        # Then we get the lengths, needed for the loss
        self.strings_len = [len(x) for x in self.strings_int]
        string_count = len(strings)

        # Fonts are drawn up front, before any image is generated, so the
        # sequence of random draws matches the order used previously.
        chosen_fonts = [
            self.fonts[random.randrange(0, len(self.fonts))]
            for _ in range(0, string_count)
        ]
        self.image_list = [
            np.expand_dims(
                np.array(
                    FakeTextDataGenerator.generate(
                        index, text, font, self.format, self.extension,
                        self.skew_angle, self.random_skew, self.blur,
                        self.random_blur, self.background, self.distorsion,
                        self.distorsion_orientation, self.handwritten,
                        self.name_format, self.width, self.alignment,
                        self.text_color, self.orientation,
                        self.space_width)), 0)
            for index, (text, font) in enumerate(zip(strings, chosen_fonts))
        ]

        self.seq = augmentations