Example #1
def test_Parser_data(self):
    with open(os.path.join(os.path.dirname(sys.argv[0]),
                           "test_parser.txt")) as fr:
        p = Parser(fr)
        with open(os.path.join(os.path.dirname(sys.argv[0]),
                               "test_parser_dataset.txt"), "w") as fw:
            p.get_data(fw)
Example #2
def test_Parser_stats(self):
    with open(os.path.join(os.path.dirname(sys.argv[0]),
                           "test_parser.txt")) as fr:
        p = Parser(fr)
        with open(os.path.join(os.path.dirname(sys.argv[0]),
                               "test_parser_result.txt"), "w") as fw:
            p.get_stats(fw)
    result = os.path.join(os.path.dirname(sys.argv[0]),
                          "test_parser_result.txt")
    compare = os.path.join(os.path.dirname(sys.argv[0]),
                           "test_parser_compare.txt")
    with open(result) as fresult:
        with open(compare) as fcompare:
            for lineresult in fresult:
                linecompare = fcompare.readline()
                # Check each generated line against the reference file.
                self.assertEqual(lineresult, linecompare)
Example #3
class Splitter(object):
    def __init__(self, data_path, split_ratio):
        self._images_path = os.path.join(data_path, 'images')
        self._out_path = os.path.join(data_path, 'images-split')
        self._split_ratio = split_ratio
        self._parser = Parser(data_path)

    def create_directories(self, labels):
        shutil.rmtree(self._out_path, ignore_errors=True)
        os.makedirs(self._out_path)
        for d in ['test', 'train']:
            for label in labels:
                os.makedirs(os.path.join(self._out_path, d, label))

    def get_train_test_image_list(self, image_list):
        np.random.shuffle(image_list)
        train_images, test_images = np.split(
            np.array(image_list), [int(len(image_list) * self._split_ratio)])

        return train_images, test_images

    def copy_images(self, images, base_dir, label):
        for image in images:
            src = os.path.join(self._images_path, image)
            dst = os.path.join(self._out_path, base_dir, label, image)
            if os.path.isfile(src):
                print(f'Copying {src} to {dst}')
                shutil.copy(src, dst)

    def split(self):
        self._parser.parse()
        self.create_directories(self._parser.labels)

        for label in self._parser.labels:
            train_images, test_images = self.get_train_test_image_list(
                self._parser.get_label_images(label))
            self.copy_images(train_images, 'train', label)
            self.copy_images(test_images, 'test', label)

    def split_binary(self):
        self._parser.parse()
        self.create_directories(self._parser.binary_labels)

        train_images, test_images = self.get_train_test_image_list(self._parser.get_no_anomaly_images())
        self.copy_images(train_images, 'train', 'No-Anomaly')
        self.copy_images(test_images, 'test', 'No-Anomaly')

        train_images, test_images = self.get_train_test_image_list(self._parser.get_anomaly_images())
        self.copy_images(train_images, 'train', 'Anomaly')
        self.copy_images(test_images, 'test', 'Anomaly')
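
A minimal driver for the Splitter above (the 'dataset' path and the 0.8 ratio are assumptions for illustration, not part of the original example):

# Hypothetical usage: split an image dataset 80/20 into train/test folders.
splitter = Splitter('dataset', split_ratio=0.8)
splitter.split()           # per-label split
# splitter.split_binary()  # alternative: Anomaly / No-Anomaly split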
Example #4
    def init(cls):
        # set font config
        cls.fontconfig = Config.font_vars["large"]

        cls.parser = Parser()
        cls.parser.parse("questions.json")
        pygame.init()
        cls.screen = pygame.display.set_mode(
            (Config.display_width, Config.display_height))

        # display cards
        cls.screen.fill(Config.niceblue)
        cls.font = pygame.font.Font(Config.font_setting[0], cls.fontconfig[0])
        pools = cls.parser.get_pools()

        cls.cardboard = Cardboard(cls.switch_menu)
        cls.menu = Menu(cls.screen, Config.font_setting[0], pools,
                        cls.switch_cardboard)
        pygame.display.flip()

        cls.main()
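
Judging by the cls parameter, init is presumably a classmethod whose @classmethod decorator was stripped from the excerpt; a hypothetical call site (the class name Game is invented for illustration):

# Assuming init is decorated with @classmethod on a class named Game:
Game.init()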
Example #5
"""
Priv Publish server
"""

# ENV is populated from command-line arguments parsed elsewhere (argparse).
ENV = {'mode': args.mode, 'ip': args.ip, 'port': args.port}

UPLOAD_FOLDER = os.environ['PRIV_DATA']
ALLOWED_EXTENSIONS = {'txt', 'csv'}
#DATA_HOME = os.environ['PRIV_DATA']

app = Flask(__name__,
            template_folder="templates",
            static_folder='templates/components')
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['MAX_CONTENT_LENGTH'] = 8 * 1024 * 1024  # 8 MB
app.secret_key = "pRiV-PuBLish"
pd_parser = Parser()


# home page
@app.route("/", methods=["GET"])
def home():
    return render_template('homepage.html')


# synthetic page
@app.route("/data_generation", methods=["GET"])
def data_generation():
    prod_ip = ENV['ip']
    return render_template('data_generation.html', prod_ip=prod_ip)
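
To serve this app locally one might append a standard Flask entry point (a sketch; reading host/port from the ENV dict above is an assumption about how this server is launched):

if __name__ == "__main__":
    # Bind to the ip/port supplied on the command line.
    app.run(host=ENV['ip'], port=int(ENV['port']))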

Example #6
class DataAnalyzer(object):
    def __init__(self, data_path):
        self._images_path = os.path.join(data_path, 'images')
        self._parser = Parser(data_path)
        self._results = MdUtils(file_name='results', title='Overview')

        self._num_images = None
        self._counts = None
        self._image_shape = None
        self._image_shape_mean = None

    def _compute_stats(self):
        self._num_images = self._parser.data.shape[1]
        self._counts = (self._parser.data.loc['anomaly_class']
                        .value_counts().to_dict())

    def _plot_random_image(self):
        # randint is inclusive on both ends; images are assumed to be named 0..N-1.
        random_image_file = f'{random.randint(0, self._num_images - 1)}.jpg'
        image = img.imread(os.path.join(self._images_path, random_image_file))
        self._image_shape = image.shape

        fig = plt.figure()
        plt.tight_layout()
        plt.imshow(image)
        plt.xticks([])
        plt.yticks([])
        plt.tight_layout()
        plt.title('Random Image')
        plt.savefig('random_image.png')
        plt.close(fig)

        fig = plt.figure()  # rebind fig so plt.close(fig) below closes this figure
        plt.tight_layout()
        plt.hist(image.flatten())
        plt.xlabel('Pixel Value')
        plt.ylabel('Counts')
        plt.title('Histogram')
        plt.savefig('random_image_histogram.png')
        plt.close(fig)

    def _compute_mean_shape(self):
        h, w = [], []
        for im in self._parser.image_list:
            print(f'Reading image: {im}')
            image = img.imread(os.path.join(self._images_path, im))
            h.append(image.shape[0])
            w.append(image.shape[1])
        self._image_shape_mean = (np.mean(h), np.mean(w))

    def _plot_image_each_class(self):
        fig = plt.figure(figsize=(10, 10))
        plt.tight_layout()
        plt.title('Random Image In Each Class')
        i = 1
        for label in self._parser.labels:
            image_list = self._parser.get_label_images(label)
            random_selection = random.choice(image_list)
            image_path = self._parser.data.loc[
                'image_filepath'].tolist()[random_selection]
            # [7:] presumably strips a fixed 7-character path prefix
            # (e.g. 'images/') from the stored filepath.
            image = img.imread(os.path.join(self._images_path, image_path[7:]))
            plt.subplot(4, 3, i)
            i += 1
            plt.subplots_adjust(hspace=1, wspace=1)
            plt.title(f'Class: {label}')
            cur_axes = plt.gca()
            cur_axes.axes.get_xaxis().set_ticks([])
            cur_axes.axes.get_yaxis().set_ticks([])
            plt.imshow(np.uint8(image))

        plt.savefig('random_image_each_class.png')
        plt.close(fig)

    def analyze(self):
        self._parser.parse()
        self._compute_stats()
        self._plot_random_image()
        self._compute_mean_shape()
        self._plot_image_each_class()

    def save_results(self):
        self._results.new_paragraph(f'Number of images: {self._num_images}')
        self._results.new_paragraph(
            f'Number of unique classes: {len(self._parser.labels)}')
        self._results.new_paragraph('Class names:')
        self._results.new_list(items=self._parser.labels)
        self._results.new_paragraph('Number of images per class:')
        self._results.new_list(
            items=[f'{k}: {v}' for k, v in self._counts.items()])
        self._results.new_paragraph(f'Image shape: {self._image_shape}')
        self._results.new_paragraph(
            f'Mean Image shape: {self._image_shape_mean}')
        self._results.new_paragraph(
            self._results.new_inline_image(text='Random Image',
                                           path='random_image.png'))
        self._results.new_paragraph(
            self._results.new_inline_image(text='Histogram',
                                           path='random_image_histogram.png'))
        self._results.new_paragraph(
            self._results.new_inline_image(text='Classes',
                                           path='random_image_each_class.png'))
        self._results.create_md_file()
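
A minimal driver for the DataAnalyzer above (the 'dataset' path is an assumption for illustration):

# Hypothetical usage: run the full analysis, then write results.md and the plots.
analyzer = DataAnalyzer('dataset')
analyzer.analyze()
analyzer.save_results()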
Example #7
def main():
    # Map each CLI token to the token that follows it, e.g. {"-w": "1000"}.
    argdict = dict(zip(sys.argv, sys.argv[1:] + ['']))
    if "-h" in argdict:
        print(help_message)
        return
    
    # Filenames of the data files.
    raw_filename = join_filenames("data", "tweets.csv")
    filtered_filename = join_filenames("data", "_tweets_filtered.txt")
    stat_filename = join_filenames("data", "tweets_stat.txt")
    tokenized_filename = join_filenames("data", "tweets_tokenized.txt")
    
    # Model dimensions, read from session.ini.
    session_config = configparser.ConfigParser()
    session_config.read('session.ini')

    word2vec_batch_size = 640
    embedding_size = int(session_config['dimension']['embedding_size'])
    gen_batch_size = 128
    gen_seq_length = int(session_config['dimension']['gen_seq_length'])
    gen_hidden_size = [int(x) for x in session_config['dimension']['gen_hidden_size'].split(',')]

    # Hyper-parameters of the model.
    learning_rate = 1E-06
    

    if "-i" in argdict:
        # Filter valid tweets from the data file, then tokenize them with the NLP parser.
        if os.path.isfile(tokenized_filename):
            proceed = (input("This will erase old data. OK to proceed? (Y/N) ") == "Y")
        else:
            proceed = True
        if proceed:
            with open_utf8(raw_filename, "r") as raw_file_r:
                #Filter actual tweets
                preparser = Preparser(raw_file_r)
                preparser.extract(filter=True)
                with open_utf8(filtered_filename, "w") as filtered_file_w:
                    preparser.save(filtered_file_w)
                
                #Tokenize tweets
                with open_utf8(filtered_filename, "r") as filtered_file_r:
                    parser = Parser(filtered_file_r)
                    with open_utf8(stat_filename, "w") as stat_file_w:
                        parser.get_stats(stat_file_w)
                    with open_utf8(tokenized_filename, "w") as tokenized_file_w:
                        parser.get_data(tokenized_file_w)
                
    if "-w" in argdict and int(argdict["-w"]) >= 0:
        #Start or continue word2vec optimization
        word2vec_num_step = int(argdict["-w"])
        if "-W" in argdict:
            word2vec_save_filename = join_filenames("saves", argdict["-W"])
        else:
            word2vec_save_filename = join_filenames(
                "saves", session_config['save_file']['word2vec_save'])
        word2vec_restore = os.path.isfile(word2vec_save_filename+".meta")
        
        word2vec = Word2Vec(tokenized_filename, stat_filename)
        word2vec.give_code()
        word2vec.tf_init(embedding_size=embedding_size,
                         batch_size=word2vec_batch_size, seed=None)
        word2vec.tf_run(word2vec_num_step, word2vec_save_filename, restore=word2vec_restore)
        
        if "-g" in argdict and int(argdict["-g"]) >= 0:
        #Start or continue generator learning
            with open_utf8(stat_filename, "r") as stat_file_r, open_utf8(tokenized_filename, "r") as tokenized_file_r:
                embeddings = word2vec.Embeddings()
                if "-G" in argdict:
                    gen_save_filename = join_filenames("saves", argdict["-G"])
                else:
                    gen_save_filename = join_filenames(
                        "saves", session_config['save_file']['generator_save'])
                gen_restore = os.path.isfile(gen_save_filename+".meta")
                generator = Generator(embeddings)
                generator.nn_init(
                    gen_batch_size, gen_seq_length, gen_hidden_size,
                    learning_rate=learning_rate, seed=None,
                    use_vector=("-V" in argdict))
                generator.train_real_data(int(argdict["-g"]), tokenized_file_r,
                    gen_save_filename, restore=gen_restore)
                
                if "-s" in argdict and int(argdict["-s"]) >= 0:
                    result_filename = join_filenames(argdict["-S"])
                    unparser = Unparser(result_filename)
                    sentences = generator.generate(gen_save_filename,
                                                   int(argdict["-s"]))
                    unparser.save(sentences)
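
The snippet relies on two project helpers that are not shown; plausible definitions (assumptions inferred from usage, not the project's actual code) are:

import os

def join_filenames(*parts):
    # Assumed helper: build a path relative to the script's directory.
    return os.path.join(os.path.dirname(__file__), *parts)

def open_utf8(filename, mode):
    # Assumed helper: open a text file with explicit UTF-8 encoding.
    return open(filename, mode, encoding="utf-8")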
Example #8
#!/usr/bin/env python3
import os
import sys

# Extend sys.path before importing from the local package, so the import resolves.
sys.path.append(os.path.dirname(__file__))

from data.parser import Parser

data_filename = 'news_tagged_data.txt'

if __name__ == "__main__":
    parser = Parser(data_filename)
    X, Y = parser.parse()
    print(X)
    print(Y)