Example #1
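Two tests of the lazy reading API: the first feeds a lazily read txt blogger corpus into `Exporter.tocsv()` with `rows_limit_in_file=1` and checks that one CSV file is written per row; the second checks that `getlazy(stream_number=n, adjust_to_cpu=False)` returns exactly n generator streams, and that counting `end_file_marker` occurrences across all streams recovers the number of files found.
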
    def test_export_to_csv_from_reader_001(self):
        self.blogger_corpus()
        self.prj_folder()
        self.blogger_lists()
        reader = Reader(os.path.join(self.tempdir_blogger_corp,
                                     self.txt_blogger_hightrepetativ_set),
                        "txt",
                        send_end_file_marker=False,
                        regex_template="blogger",
                        mode=self.mode)
        exporter = Exporter(reader.getlazy(), mode=self.mode)

        exporter.tocsv(self.tempdir_project_folder,
                       "blogger_corpus",
                       self.fieldnames,
                       rows_limit_in_file=1)

        i = 0
        for item in os.listdir(self.tempdir_project_folder):
            if ".csv" in item:
                i += 1

        assert len(list(reader.getlazy())) == i

    def test_getlazy_many_streams_from_txt_with_given_number_of_streams_without_adjust_for_current_cpu_521(
            self):
        self.blogger_corpus()
        end_file_marker = -1
        reader = Reader(os.path.join(self.tempdir_blogger_corp,
                                     self.txt_blogger_small_fake_set),
                        "txt",
                        regex_template="blogger",
                        mode=self.mode,
                        end_file_marker=end_file_marker,
                        send_end_file_marker=True)
        number_of_found_files = reader._get_number_of_left_over_files()

        # Check for stream_number=3
        len(reader.getlazy(stream_number=3,
                           adjust_to_cpu=False)).should.be.equal(3)
        len([
            rowdict
            for gen in reader.getlazy(stream_number=3, adjust_to_cpu=False)
            for rowdict in gen if end_file_marker == rowdict
        ]).should.be.equal(number_of_found_files)

        # Check for stream_number=2
        len(reader.getlazy(stream_number=2,
                           adjust_to_cpu=False)).should.be.equal(2)
        len([
            rowdict
            for gen in reader.getlazy(stream_number=2, adjust_to_cpu=False)
            for rowdict in gen if end_file_marker == rowdict
        ]).should.be.equal(number_of_found_files)

        i = 0
        for gen, fname in zip(
                reader.getlazy(stream_number=3,
                               adjust_to_cpu=False,
                               min_files_pro_stream=1),
                reversed(reader.files_to_read_orig)):
            for row_dict in gen:
                if row_dict == end_file_marker:
                    i += 1
                    continue
                with codecs.open(fname, "r", encoding="utf-8") as f:
                    t = f.read()
                assert row_dict["text"] == t
                assert isinstance(row_dict, dict)
                assert len(row_dict) == 6
                assert 'text' in row_dict
                assert 'star_constellation' in row_dict
                assert 'working_area' in row_dict
                assert 'age' in row_dict
                assert 'id' in row_dict
                assert 'gender' in row_dict
        assert number_of_found_files == i
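
The stream-splitting contract checked above, stripped of the test harness; a minimal sketch that reuses only keywords appearing in these tests ("corpus_dir" is a placeholder path, and Reader is assumed to be imported as in the test module):

# Hedged sketch; "corpus_dir" is a placeholder, Reader imported as above.
reader = Reader("corpus_dir", "txt",
                regex_template="blogger",
                end_file_marker=-1,
                send_end_file_marker=True)
streams = reader.getlazy(stream_number=3, adjust_to_cpu=False)
# One generator per stream; every file ends with the marker, so the
# marker count across all streams equals the number of files read.
files_read = sum(1 for gen in streams for row in gen if row == -1)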
Example #3
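Initialising an `Exporter` directly from a `Reader`, followed by lazy readers with column selection: `getlazy(colnames=[...])` restricts every yielded dict to the requested columns, for twitter json, blogger json, and xml corpora alike. A condensed sketch of the column-selection contract follows the last test.
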
    def test_exporter_initialisation_with_reader_obj_001(self):
        self.blogger_corpus()
        reader = Reader(os.path.join(self.tempdir_blogger_corp,
                                     self.txt_blogger_hightrepetativ_set),
                        "txt",
                        regex_template="blogger",
                        mode=self.mode)
        exporter = Exporter(reader.getlazy(), mode=self.mode)
        exporter.should.be.a(Exporter)

    def test_lazyreader_from_twitter_json_for_given_colnames_512(self):
        self.twitter_corpus()
        end_file_marker = -1
        reader = Reader(os.path.join(self.tempdir_twitter_corp,
                                     self.json_twitter_set),
                        "json",
                        formatter_name="TwitterStreamAPI",
                        mode=self.mode,
                        end_file_marker=end_file_marker)
        for data in reader.getlazy(colnames=["text"]):
            if data == end_file_marker:
                continue
            if data:
                assert isinstance(data, dict)
                assert len(data) == 1
                assert 'text' in data

    def test_lazyreader_from_json_for_given_colnames_510(self):
        self.blogger_corpus()
        end_file_marker = -1
        reader = Reader(os.path.join(self.tempdir_blogger_corp,
                                     self.json_blogger_small_fake_set),
                        "json",
                        mode=self.mode,
                        end_file_marker=end_file_marker)
        for data in reader.getlazy(
                colnames=["text", 'star_constellation', 'gender']):
            if data == end_file_marker:
                continue
            assert isinstance(data, dict)
            assert len(data) == 3
            assert 'text' in data
            assert 'star_constellation' in data
            assert 'gender' in data

    def test_lazyreader_from_xml_for_given_colnames_507(self):
        self.blogger_corpus()
        end_file_marker = -1
        reader = Reader(os.path.join(self.tempdir_blogger_corp,
                                     self.xml_blogger_small_fake_set),
                        "xml",
                        mode=self.mode,
                        end_file_marker=end_file_marker)
        for data in reader.getlazy(
                colnames=["text", 'star_constellation', 'gender']):
            if data == end_file_marker:
                continue
            assert isinstance(data, dict)
            assert len(data) == 3
            assert 'text' in data
            assert 'star_constellation' in data
            assert 'gender' in data

    def test_lazyreader_from_json_with_utf8_509(self):
        self.blogger_corpus()
        end_file_marker = -1
        reader = Reader(os.path.join(self.tempdir_blogger_corp,
                                     self.json_blogger_hightrepetativ_set),
                        "json",
                        mode=self.mode,
                        end_file_marker=end_file_marker)
        for data in reader.getlazy():
            if data == end_file_marker:
                continue
            assert isinstance(data, dict)
            assert len(data) == 6
            assert 'text' in data
            assert 'star_constellation' in data
            assert 'working_area' in data
            assert 'age' in data
            assert 'id' in data
            assert 'gender' in data
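
The column-selection contract in a condensed, hedged form (placeholder path, calls taken from the tests above):

# Hedged sketch of getlazy(colnames=...); "corpus_dir" is a placeholder.
end_file_marker = -1
reader = Reader("corpus_dir", "json", end_file_marker=end_file_marker)
for row in reader.getlazy(colnames=["text", "gender"]):
    if row == end_file_marker:  # skip the end-of-file sentinel
        continue
    # every yielded dict holds exactly the requested columns
    assert set(row) == set(["text", "gender"])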
Example #8
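Export to SQLite from a lazy reader, then lazy readers over twitter json, csv (including the sifter format with a ';' delimiter), txt, and xml corpora, and finally multi-stream reading that also pulls files out of zip archives; a short zip-reading sketch follows the last test.
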
    def test_export_to_sqlite_from_reader_007(self):
        self.blogger_corpus()
        self.prj_folder()
        self.blogger_lists()
        reader = Reader(os.path.join(self.tempdir_blogger_corp,
                                     self.txt_blogger_hightrepetativ_set),
                        "txt",
                        send_end_file_marker=False,
                        regex_template="blogger",
                        mode=self.mode)
        exporter = Exporter(reader.getlazy(), mode=self.mode)
        dbname = "blogger_corpus"

        exporter.tosqlite(self.tempdir_project_folder, dbname, self.fieldnames)

        for item in os.listdir(self.tempdir_project_folder):
            if ".db" in item:
                assert dbname in item

    def test_lazyreader_from_twitter_json_with_utf8_511(self):
        self.twitter_corpus()
        end_file_marker = -1
        reader = Reader(os.path.join(self.tempdir_twitter_corp,
                                     self.json_twitter_set),
                        "json",
                        formatter_name="TwitterStreamAPI",
                        mode=self.mode,
                        end_file_marker=end_file_marker)
        for data in reader.getlazy():
            if data == end_file_marker:
                continue
            if data:
                assert isinstance(data, dict)
                assert 'text' in data
                assert 'u_lang' in data
                assert 'id' in data
                assert 'u_id' in data

    def test_lazyreader_from_csv_with_utf8_503(self):
        self.blogger_corpus()
        end_file_marker = -1
        reader = Reader(os.path.join(self.tempdir_blogger_corp,
                                     self.csv_blogger_hightrepetativ_set),
                        "csv",
                        mode=self.mode,
                        end_file_marker=end_file_marker)
        for data in reader.getlazy():
            if data == end_file_marker:
                continue
            assert isinstance(data, dict)
            assert len(data) == len(
                self.configer.docs_row_values(token=True,
                                              unicode_str=True)["blogger"][0])
            assert 'text' in data
            assert 'star_constellation' in data
            assert 'working_area' in data
            assert 'age' in data
            assert 'id' in data
            assert 'gender' in data

    def test_lazyreader_from_sifter_twitter_csv_with_utf8_513(self):
        self.twitter_corpus()
        end_file_marker = -1
        reader = Reader(os.path.join(self.tempdir_twitter_corp,
                                     "CSV/zas-rep-tool/sifter"),
                        "csv",
                        formatter_name="sifter",
                        mode=self.mode,
                        end_file_marker=end_file_marker)
        for data in reader.getlazy(csvdelimiter=";"):
            if data == end_file_marker:
                continue
            if data:
                assert isinstance(data, dict)
                assert 'text' in data
                assert 'u_lang' in data
                assert 'id' in data
                assert 'u_id' in data

    def test_lazyreader_from_txt_500(self):
        self.blogger_corpus()
        end_file_marker = -1
        reader = Reader(os.path.join(self.tempdir_blogger_corp,
                                     self.txt_blogger_small_fake_set),
                        "txt",
                        regex_template="blogger",
                        mode=self.mode,
                        end_file_marker=end_file_marker)
        for data in reader.getlazy():
            if data == end_file_marker:
                continue
            assert isinstance(data, dict)
            assert len(data) == 6
            assert 'text' in data
            assert 'star_constellation' in data
            assert 'working_area' in data
            assert 'age' in data
            assert 'id' in data
            assert 'gender' in data

    def test_lazyreader_from_xml_with_ascii_505(self):
        self.blogger_corpus()
        end_file_marker = -1
        reader = Reader(os.path.join(self.tempdir_blogger_corp,
                                     self.xml_blogger_small_fake_set),
                        "xml",
                        mode=self.mode,
                        end_file_marker=end_file_marker)
        for data in reader.getlazy():
            if data == end_file_marker:
                continue
            assert isinstance(data, dict)
            assert len(data) == 6
            assert 'text' in data
            assert 'star_constellation' in data
            assert 'working_area' in data
            assert 'age' in data
            assert 'id' in data
            assert 'gender' in data

    def test_getlazy_many_streams_from_json_also_getted_from_zips_519(self):
        self.blogger_corpus()

        # Test 1: check that the number of retrieved files is correct
        end_file_marker = -1
        reader = Reader(self.tempdir_blogger_corp,
                        "json",
                        mode=self.mode,
                        read_from_zip=True,
                        end_file_marker=end_file_marker,
                        send_end_file_marker=True)
        number_of_found_files = reader._get_number_of_left_over_files()
        assert number_of_found_files >= 3

        # this relies on the main test-case folder also being zipped, so that
        # the archives contain the same number of files as the plain folders
        assert reader.files_number_in_zips == len(reader.files_to_read_orig)

        number_of_read_files = len([
            row for gen in reader.getlazy(
                stream_number=4, adjust_to_cpu=True, min_files_pro_stream=5)
            for row in gen if row == end_file_marker
        ])
        assert number_of_found_files == number_of_read_files

        # Test 2: check that the right number of streams is returned
        len(
            reader.getlazy(stream_number=4,
                           adjust_to_cpu=True,
                           min_files_pro_stream=5)).should.be.equal(
                               get_number_of_streams_adjust_cpu(
                                   5, number_of_found_files, 4))
        len(
            reader.getlazy(stream_number=4,
                           adjust_to_cpu=True,
                           min_files_pro_stream=3)).should.be.equal(
                               get_number_of_streams_adjust_cpu(
                                   3, number_of_found_files, 4))
        len(
            reader.getlazy(stream_number=4,
                           adjust_to_cpu=True,
                           min_files_pro_stream=2)).should.be.equal(
                               get_number_of_streams_adjust_cpu(
                                   2, number_of_found_files, 4))
        len(
            reader.getlazy(stream_number=4,
                           adjust_to_cpu=True,
                           min_files_pro_stream=1)).should.be.equal(
                               get_number_of_streams_adjust_cpu(
                                   1, number_of_found_files, 4))

        i = 0
        for gen in reader.getlazy(stream_number=1000,
                                  adjust_to_cpu=True,
                                  min_files_pro_stream=1):
            for row_dict in gen:
                if row_dict == end_file_marker:
                    i += 1
                    continue
                assert isinstance(row_dict, dict)
                assert len(row_dict) == 6
                assert 'text' in row_dict
                assert 'star_constellation' in row_dict
                assert 'working_area' in row_dict
                assert 'age' in row_dict
                assert 'id' in row_dict
                assert 'gender' in row_dict
        assert number_of_found_files == i
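
The zip-aware part of the last test reduced to its core; a sketch under the same assumptions as above (placeholder path, keywords taken from the test):

# Hedged sketch of zip-aware reading; "corpus_dir" is a placeholder.
reader = Reader("corpus_dir", "json",
                read_from_zip=True,
                end_file_marker=-1,
                send_end_file_marker=True)
# Files found inside .zip archives count toward the total
# alongside the plain files on disk.
total_files = reader._get_number_of_left_over_files()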
Example #15
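A generator that materialises the txt blogger corpus as csv, xml, json, and sqlite test sets (yielding True or False per created set and zipping each format folder afterwards), followed by a test that re-reads every created set and compares it column by column against the txt original.
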
    def create_testsets_in_diff_file_formats(self,
                                             rewrite=False,
                                             abs_path_to_storage_place=False,
                                             silent_ignore=True):
        if not rewrite:
            rewrite = self._rewrite
        if not abs_path_to_storage_place:
            abs_path_to_storage_place = self._path_to_zas_rep_tools
        created_sets = []
        if not abs_path_to_storage_place:
            sys.exit()
        try:
            # create test sets for the Blogger corpus
            for file_format, test_sets in self._types_folder_names_of_testsets.iteritems():
                for name_of_test_set, folder_for_test_set in test_sets.iteritems():
                    if file_format == "txt":
                        continue
                    abs_path_to_current_test_case = os.path.join(
                        abs_path_to_storage_place,
                        self._path_to_testsets["blogger"],
                        folder_for_test_set)
                    if rewrite:
                        if os.path.isdir(abs_path_to_current_test_case):
                            shutil.rmtree(abs_path_to_current_test_case)

                    if not os.path.isdir(abs_path_to_current_test_case):
                        os.makedirs(abs_path_to_current_test_case)

                    path_to_txt_corpus = os.path.join(
                        self.path_to_zas_rep_tools,
                        self._path_to_testsets["blogger"],
                        self._types_folder_names_of_testsets["txt"][name_of_test_set])

                    reader = Reader(path_to_txt_corpus,
                                    "txt",
                                    regex_template="blogger",
                                    logger_level=self._logger_level,
                                    logger_traceback=self._logger_traceback,
                                    logger_folder_to_save=self._logger_folder_to_save,
                                    logger_usage=self._logger_usage,
                                    logger_save_logs=self._logger_save_logs,
                                    mode=self._mode,
                                    error_tracking=self._error_tracking,
                                    ext_tb=self._ext_tb)
                    exporter = Exporter(reader.getlazy(),
                                        rewrite=rewrite,
                                        silent_ignore=silent_ignore,
                                        logger_level=self._logger_level,
                                        logger_traceback=self._logger_traceback,
                                        logger_folder_to_save=self._logger_folder_to_save,
                                        logger_usage=self._logger_usage,
                                        logger_save_logs=self._logger_save_logs,
                                        mode=self._mode,
                                        error_tracking=self._error_tracking,
                                        ext_tb=self._ext_tb)

                    if file_format == "csv":
                        if name_of_test_set == "small":
                            flag = exporter.tocsv(abs_path_to_current_test_case, "blogger_corpus",self._columns_in_doc_table["blogger"], rows_limit_in_file=5)
                            if not flag:
                                yield False
                            else:
                                created_sets.append("csv")
                                yield True
                        else:
                            flag= exporter.tocsv(abs_path_to_current_test_case, "blogger_corpus",self._columns_in_doc_table["blogger"], rows_limit_in_file=2)
                            if not flag:
                                yield False
                            else:
                                created_sets.append("csv")
                                yield True
                        
                    

                    elif file_format == "xml":
                        if name_of_test_set == "small":
                            flag = exporter.toxml(abs_path_to_current_test_case, "blogger_corpus", rows_limit_in_file=5)
                            if not flag:
                                yield False
                            else:
                                created_sets.append("xml")
                                yield True
                        else:
                            flag = exporter.toxml(abs_path_to_current_test_case, "blogger_corpus", rows_limit_in_file=2)
                            if not flag:
                                yield False
                            else:
                                created_sets.append("xml")
                                yield True


                    elif file_format == "json":
                        if name_of_test_set == "small":
                            flag = exporter.tojson(abs_path_to_current_test_case, "blogger_corpus", rows_limit_in_file=5)
                            if not flag:
                                yield False
                            else:
                                created_sets.append("json")
                                yield True
                        
                        else:
                            flag = exporter.tojson(abs_path_to_current_test_case, "blogger_corpus", rows_limit_in_file=2)
                            if not flag:
                                yield False
                            else:
                                created_sets.append("json")
                                yield True
  


                    elif file_format == "sqlite":
                        flag = exporter.tosqlite(abs_path_to_current_test_case, "blogger_corpus",self._columns_in_doc_table["blogger"])
                        if not flag:
                            yield False
                        else:
                            created_sets.append("sqlite")
                            yield True

            #p(created_sets, "created_sets")
            for created_set in set(created_sets):
                path_to_set = os.path.join(abs_path_to_storage_place, self._path_to_testsets["blogger"], created_set)
                #p(path_to_set)
                #p(os.path.join(os.path.split(path_to_set)[0], created_set+".zip"))
                make_zipfile(os.path.join(os.path.split(path_to_set)[0], created_set+".zip"), path_to_set)

            self.logger.info("TestSets (diff file formats) was initialized.")
        except Exception, e:
            print_exc_plus() if self._ext_tb else ""
            self.logger.error("SubsetsCreaterError: Throw following Exception: '{}'. ".format(e), exc_info=self._logger_traceback)
    def test_create_all_test_cases_for_diff_fileformats_502(self):
        self.prj_folder()
        configer = TestsConfiger(mode=self.mode)
        abs_path_to_storage_place = self.tempdir_project_folder

        returned_flags = set(
            configer.create_testsets_in_diff_file_formats(
                rewrite=False,
                abs_path_to_storage_place=abs_path_to_storage_place))
        # at least one test set must have been created successfully
        assert True in returned_flags
        for file_format, test_sets in configer.types_folder_names_of_testsets.iteritems():
            for name_of_test_set, folder_for_test_set in test_sets.iteritems():
                if file_format == "txt":
                    continue

                if file_format == "sqlite":
                    continue
                abs_path_to_current_test_case = os.path.join(
                    abs_path_to_storage_place,
                    configer._path_to_testsets["blogger"], folder_for_test_set)

                if not os.path.isdir(abs_path_to_current_test_case):
                    os.makedirs(abs_path_to_current_test_case)

                path_to_txt_corpus = os.path.join(
                    configer.path_to_zas_rep_tools,
                    configer._path_to_testsets["blogger"],
                    configer._types_folder_names_of_testsets["txt"]
                    [name_of_test_set])
                reader_txt = Reader(path_to_txt_corpus,
                                    "txt",
                                    regex_template="blogger",
                                    send_end_file_marker=False,
                                    mode=self.mode)
                reader_current_set = Reader(abs_path_to_current_test_case,
                                            file_format,
                                            send_end_file_marker=False,
                                            mode=self.mode)
                data_from_txt = defaultdict(list)
                data_from_current_set = defaultdict(list)
                for item in reader_txt.getlazy():
                    for k, v in item.iteritems():
                        if unicode(v).isnumeric():
                            v = int(v)
                        data_from_txt[k].append(v)

                for item in reader_current_set.getlazy():
                    for k, v in item.iteritems():
                        if unicode(v).isnumeric():
                            v = int(v)
                        data_from_current_set[k].append(v)

                for col in self.configer.columns_in_doc_table["blogger"]:
                    if col != "rowid":
                        for txt_item, current_set_item in zip(
                                sorted(data_from_txt[col]),
                                sorted(data_from_current_set[col])):
                            assert txt_item == current_set_item
    def test_getlazy_many_streams_from_txt_without_given_number_of_streams_adjusted_for_current_cpu_520(
            self):
        self.blogger_corpus()
        end_file_marker = -1
        reader = Reader(os.path.join(self.tempdir_blogger_corp,
                                     self.txt_blogger_small_fake_set),
                        "txt",
                        regex_template="blogger",
                        mode=self.mode,
                        end_file_marker=end_file_marker,
                        send_end_file_marker=True)
        number_of_found_files = reader._get_number_of_left_over_files()
        len(
            reader.getlazy(stream_number=4,
                           adjust_to_cpu=True,
                           min_files_pro_stream=3)).should.be.equal(
                               get_number_of_streams_adjust_cpu(
                                   3, number_of_found_files, 4))
        len(
            reader.getlazy(stream_number=4,
                           adjust_to_cpu=True,
                           min_files_pro_stream=5)).should.be.equal(
                               get_number_of_streams_adjust_cpu(
                                   5, number_of_found_files, 4))
        len(
            reader.getlazy(stream_number=4,
                           adjust_to_cpu=True,
                           min_files_pro_stream=2)).should.be.equal(
                               get_number_of_streams_adjust_cpu(
                                   2, number_of_found_files, 4))
        len(
            reader.getlazy(stream_number=4,
                           adjust_to_cpu=True,
                           min_files_pro_stream=1)).should.be.equal(
                               get_number_of_streams_adjust_cpu(
                                   1, number_of_found_files, 4))

        i = 0
        for gen in reader.getlazy(stream_number=1000,
                                  adjust_to_cpu=True,
                                  min_files_pro_stream=1):
            for row_dict in gen:
                if row_dict == end_file_marker:
                    i += 1
                    continue
                assert isinstance(row_dict, dict)
                assert len(row_dict) == 6
                assert 'text' in row_dict
                assert 'star_constellation' in row_dict
                assert 'working_area' in row_dict
                assert 'age' in row_dict
                assert 'id' in row_dict
                assert 'gender' in row_dict
        assert number_of_found_files == i
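
Taken together, these assertions pin down the splitting bound; the helper below is an illustration of that bound only, not the library's get_number_of_streams_adjust_cpu (which, with adjust_to_cpu=True, also folds in the actual CPU count):

# Hypothetical illustration, not the library implementation.
def expected_streams_upper_bound(min_files_pro_stream, files, requested):
    by_files = max(1, files // min_files_pro_stream)  # each stream needs enough files
    return min(requested, by_files)                   # and never more than requested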