def download(oaid, rec):
    """Download every file linked from a record and run the validator on it.

    The record text is split on ``<dc:identifier xsi:type="tel:URL">`` /
    ``</dc:identifier>`` and every fragment containing ``/download/`` is
    treated as a URL. Each file is stored under ``downloadpath + oaid``;
    a downloaded ``*.zip`` is unpacked there, and afterwards every entry
    of that directory whose name contains neither ``zip`` nor ``json`` is
    handed to ``validator.main``.

    Args:
        oaid: Identifier of the record; used as the sub-directory name.
        rec: Raw record text (XML as a string).

    Returns:
        False when the record reports ``idDoesNotExist`` and does not
        contain ``hasFile:true``; True otherwise (even if no download
        link was found).
    """
    if '<error code="idDoesNotExist">' in rec and "hasFile:true" not in rec:
        return False

    fragments = re.split(
        '<dc:identifier xsi:type="tel:URL">|</dc:identifier>', rec)
    for url in fragments:
        if "/download/" not in url:
            continue

        # The server announces the real file name in Content-disposition.
        response = requests.get(url)
        disposition = str(response.headers['Content-disposition'])
        filename = str(
            re.findall("filename=(.+)", disposition)[0]).replace('"', '')

        target_dir = downloadpath + oaid
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        target = target_dir + "/" + filename
        wget.download(url, target)

        # If there is no PDF, there will almost always be a zip archive;
        # unpack it so that its contents get validated. Only the file that
        # was just downloaded is opened as an archive: an older *.zip in
        # the directory must not cause a non-zip download to be opened
        # with ZipFile.
        if fnmatch.fnmatch(filename, "*.zip"):
            with zipfile.ZipFile(target, 'r') as zip_ref:
                zip_ref.extractall(target_dir)

        for entry in os.listdir(target_dir):
            # `("zip" or "json") not in entry` only ever tested "zip";
            # both substrings have to be checked explicitly.
            if "zip" not in entry and "json" not in entry:
                validator.main(target_dir + "/" + entry, target_dir + "/")
    return True
def test_relations_happy(self) -> None:
    """Validating the relations file must not trigger sys.exit().

    sys.argv and sys.exit are patched for the duration of the call; the
    list handed to mock_sys_exit() is expected to stay empty.
    """
    exit_codes: List[int] = []
    cli_args = ["", "tests/data/relations.yaml"]
    with unittest.mock.patch('sys.argv', cli_args), \
            unittest.mock.patch('sys.exit', mock_sys_exit(exit_codes)):
        validator.main()
    self.assertEqual(exit_codes, [])
def test_relations_happy(self) -> None:
    """Every known-good input must validate without calling sys.exit().

    Each path is validated in turn with sys.argv and sys.exit patched;
    the list handed to mock_sys_exit() is expected to stay empty.
    """
    good_inputs = (
        "tests/data/relations.yaml",
        "tests/data/relation-gazdagret-filter-invalid-good.yaml",
        "tests/data/relation-gazdagret-filter-invalid-good2.yaml",
    )
    for yaml_path in good_inputs:
        exit_codes: List[int] = []
        with unittest.mock.patch('sys.argv', ["", yaml_path]), \
                unittest.mock.patch('sys.exit', mock_sys_exit(exit_codes)):
            validator.main()
        self.assertEqual(exit_codes, [])
def _find_single_vector(folder, port_name):
    """Return the path of the one .shp/.geojson file directly in *folder*.

    Raises:
        ValueError: if the folder holds no such file, or more than one.
    """
    # glob.glob is the documented replacement for the private glob.glob1.
    matches = glob.glob(os.path.join(folder, '*.shp'))
    matches += glob.glob(os.path.join(folder, '*.geojson'))
    if not matches:
        raise ValueError(
            "No shp or geojson files found in input data port '%s'" % port_name)
    if len(matches) > 1:
        raise ValueError(
            "Multiple shp or geojson found in input data port '%s'" % port_name)
    return matches[0]


def main():
    """Validate the 'test' vector file against the 'truth' vector file.

    Reads the string ports from ``/mnt/work/input/ports.json`` (``iou``,
    default ``'False'``; ``out_csv``, default ``'results.csv'``), locates
    exactly one shapefile or GeoJSON in each of the ``truth`` and ``test``
    input folders, and calls ``validator.main`` with the CSV output path
    under ``/mnt/work/output/data``.

    Raises:
        ValueError: if an input folder holds zero or several vector files.
    """
    # get the inputs
    input_truth = '/mnt/work/input/truth'
    input_test = '/mnt/work/input/test'
    string_ports = '/mnt/work/input/ports.json'

    # create output directory
    out_path = '/mnt/work/output/data'
    if not os.path.exists(out_path):
        os.makedirs(out_path)

    # read the inputs
    with open(string_ports) as ports:
        inputs = json.load(ports)
    iou = inputs.get('iou', 'False')
    out_csv = inputs.get('out_csv', 'results.csv')

    # convert the inputs to the correct dtypes
    iou = convert_type(iou, bool, 'Boolean')
    out_csv = convert_type(out_csv, str, 'String')

    # get the single shp or geojson of each input port (truth first, so a
    # problem in both folders is still reported for 'truth')
    in_truth = _find_single_vector(input_truth, 'truth')
    in_test = _find_single_vector(input_test, 'test')

    # set the output file path
    out = os.path.join(out_path, out_csv)

    print("Validating test data against truth data...")

    # run the processing
    validator.main([in_test, in_truth, out, '-u', iou])

    print("Validation process completed successfully.")
def assert_failure_msg(self, path: str, expected: str) -> None:
    """Assert that validating *path* exits with 1 and prints *expected*.

    sys.argv, sys.stdout and sys.exit are patched while the validator
    runs; standard output is captured in an in-memory buffer.
    """
    captured = io.StringIO()
    exit_codes: List[int] = []
    with unittest.mock.patch('sys.argv', ["", path]), \
            unittest.mock.patch('sys.stdout', captured), \
            unittest.mock.patch('sys.exit', mock_sys_exit(exit_codes)):
        validator.main()
    self.assertEqual(exit_codes, [1])
    self.assertEqual(captured.getvalue(), expected)
def test_relation_happy(self) -> None:
    """A valid relation file produces no output and no sys.exit() call.

    sys.argv, sys.stdout and sys.exit are patched while the validator
    runs; standard output is captured in an in-memory buffer.
    """
    captured = io.StringIO()
    exit_codes: List[int] = []
    cli_args = ["", "tests/data/relation-gazdagret.yaml"]
    with unittest.mock.patch('sys.argv', cli_args), \
            unittest.mock.patch('sys.stdout', captured), \
            unittest.mock.patch('sys.exit', mock_sys_exit(exit_codes)):
        validator.main()
    self.assertEqual(exit_codes, [])
    self.assertEqual(captured.getvalue(), "")
def test_relations_missing_osmrelation(self) -> None:
    """A relations file lacking 'osmrelation' fails with a precise message.

    sys.argv, sys.stdout and sys.exit are patched while the validator
    runs; the recorded exit code must be 1 and the captured output must
    equal the expected error line.
    """
    expected = (
        "failed to validate tests/data/relations-missing-osmrelation/relations.yaml"
        ": missing key 'gazdagret.osmrelation'\n"
    )
    captured = io.StringIO()
    exit_codes: List[int] = []
    cli_args = ["", "tests/data/relations-missing-osmrelation/relations.yaml"]
    with unittest.mock.patch('sys.argv', cli_args), \
            unittest.mock.patch('sys.stdout', captured), \
            unittest.mock.patch('sys.exit', mock_sys_exit(exit_codes)):
        validator.main()
    self.assertEqual(exit_codes, [1])
    self.assertEqual(captured.getvalue(), expected)
def assert_failure_msg(self, path: str, expected: str) -> None:
    """Assert that validating *path* returns 1 and writes *expected*.

    The validator is called with an explicit argv list and an in-memory
    stream; its return value and the stream contents are then checked.
    """
    stream = io.StringIO()
    exit_code = validator.main(["", path], stream)
    self.assertEqual(exit_code, 1)
    self.assertEqual(stream.getvalue(), expected)
def test_relation_happy(self) -> None:
    """A valid relation file returns 0 and writes nothing to the stream."""
    stream = io.StringIO()
    exit_code = validator.main(
        ["", "tests/data/relation-gazdagret.yaml"], stream)
    self.assertEqual(exit_code, 0)
    self.assertEqual(stream.getvalue(), "")
def try_password(password):
    """Optionally show the password being tried, then check whether it works.

    When ``app.args.show`` equals ``"true"`` a progress line is printed
    (carriage return, no newline). The password is accepted when
    ``validator.main(password, app.args.command)`` equals ``app.args.code``.

    Returns:
        True if the password is accepted, False otherwise.
    """
    if app.args.show == "true":
        print("\r Trying %s..." % password, end="")

    outcome = validator.main(password, app.args.command)
    if outcome == app.args.code:
        return True
    return False
def test_relations_missing_osmrelation(self) -> None:
    """A relations file lacking 'osmrelation' fails with a precise message.

    The validator must return 1 and write exactly the expected error line
    to the stream it was given.
    """
    yaml_path = "tests/data/relations-missing-osmrelation/relations.yaml"
    expected = (
        "failed to validate tests/data/relations-missing-osmrelation/relations.yaml"
        ": missing key 'gazdagret.osmrelation'\n"
    )
    stream = io.StringIO()
    exit_code = validator.main(["", yaml_path], stream)
    self.assertEqual(exit_code, 1)
    self.assertEqual(stream.getvalue(), expected)
def test_relations_happy(self) -> None:
    """Every known-good input must make the validator return 0.

    Each file is validated with a fresh in-memory stream whose contents
    are not inspected.
    """
    good_inputs = (
        "tests/data/relations.yaml",
        "tests/data/relation-gazdagret-filter-invalid-good.yaml",
        "tests/data/relation-gazdagret-filter-invalid-good2.yaml",
        "tests/data/relation-gazdagret-filter-valid-good.yaml",
        "tests/data/relation-gazdagret-filter-valid-good2.yaml",
    )
    for yaml_path in good_inputs:
        exit_code = validator.main(["", yaml_path], io.StringIO())
        self.assertEqual(exit_code, 0)
def main():
    """Run the interactive spelling-suggestion prompt until the user exits.

    Each input line is lower-cased and then handled as follows:

    * contains ``exit()``     -> leave the loop;
    * contains ``validate()`` -> pass the result of ``validator.main()``
      to ``validate``;
    * otherwise print the first candidate accepted by ``check_presence``,
      trying in order: the input itself, ``extract_unique(input)``, each
      item of ``possibile_dups(input)``, and finally the ``vowels()``
      variants of each of those items. ``NO SUGGESTION`` is printed when
      nothing is accepted.
    """
    # A parenthesised single argument prints the same text as the bare
    # Python 2 print statement used throughout this module.
    print("Options: 1) Enter a word to check its best spelling suggestion")
    print("2) Enter validate() to validate an input")
    print("3) Enter exit() to exit the program")

    while True:
        word = raw_input("> ").lower()

        if 'exit()' in word:
            break
        if 'validate()' in word:
            to_validate = validator.main()
            validate(to_validate)
            continue

        if check_presence(word):
            print(word)
            continue

        deduped = extract_unique(word)
        if check_presence(deduped):
            print(deduped)
            continue

        found = False
        candidates = possibile_dups(word)
        for candidate in candidates:
            if check_presence(candidate):
                found = True
                print(candidate)
                break

        # This pass runs even after a hit above; the `not found` test keeps
        # it from printing a second suggestion.
        for candidate in candidates:
            for variant in vowels(candidate):
                if check_presence(variant) and not found:
                    found = True
                    print(variant)
                    break

        if not found:
            print("NO SUGGESTION")
def main(file_name, **kwargs):
    """Move one landing-zone file to HDFS as raw, duplicate or spam, then validate.

    The file is checked against the registry (``reg``; the log messages
    call the service "Alfred"):

    * registry entry present and target absent  -> written as a raw file
      (plus a work copy unless the stage is ``sandbox``) and registered;
    * registry entry present and target present -> written as a duplicate;
    * no registry entry                          -> written as spam.

    After the copy the HDFS file size is compared with the local size.
    On a match the landing-zone file is deleted; otherwise the HDFS copy
    is removed, the local file is renamed to ``<name>.err`` and a
    ``ValueError`` is raised. Raw files are finally handed to
    ``validator.main`` unless ``copy_only`` is set.

    Args:
        file_name: Name of the file inside ``landing_zone``. A ``sbx_``
            prefix marks a sandbox file; the prefix is stripped for the
            registry lookup.
        **kwargs: Forwarded to ``reg.file_path``. A truthy ``copy_only``
            skips the validation step.

    Raises:
        ValueError: if the HDFS copy is missing or its size differs from
            the source file.
        SystemExit: if the registry cannot be reached.
    """
    metadata = {}
    write_path = instance_guid = stage = header = ""
    file_type = "raw"
    source_path = landing_zone + '/' + file_name

    # Log the PID to help in debugging
    logger.info('Pid : ' + str(os.getpid()))

    try:
        # attempt to get the registry entry. If Alfred isn't working
        # properly we'll get a connection error
        if file_name.startswith('sbx_'):
            # registry keys for sandbox files don't carry the "sbx_" prefix
            metadata = reg.get_metadata(file_name[4:], stage='sandbox')
        else:
            metadata = reg.get_metadata(file_name)
    except requests.ConnectionError as e:
        logger.error('Failed to connect to Alfred : ' + str(e))
        exit(e)

    if 'stage' in metadata:
        stage = metadata['stage']

    # get the count of the number of rows in the source file
    row_count = file_len(source_path)
    logger.info('row count = ' + str(row_count))

    if 'file' in metadata and metadata['file'] != {}:
        # a registry entry exists for the file, process it
        logger.info("Moving " + file_name + " to hdfs://" +
                    reg.file_path(metadata, **kwargs))
        # set the write path based on the metadata
        write_path = reg.file_path(metadata, **kwargs)
        logger.info("Moving " + file_name + " to " + write_path)

        if stage == 'sandbox' and hdfs.path.exists(write_path + '/' + file_name):
            # in the case of sandbox files previous data is always overwritten
            logger.info("Sandbox file already exists, overwriting")
            # Delete from HDFS is not strictly needed if the table was
            # created as external
            hdfs.rmr(write_path + '/' + file_name)
            # use a hive connection to delete the sandbox table
            hive = validator.Hive()
            hive.drop_table(metadata, stage=stage)
            # drop the reference to release the hive connection
            hive = None

        # check to make sure the file doesn't already exist
        if not hdfs.path.exists(write_path + '/' + file_name):
            # if it doesn't, write it to the appropriate location
            hdfs.put(source_path, write_path + '/' + file_name)
            if stage != 'sandbox':
                # create second copy for the work table, replacing any
                # work files that are already present
                work_write_path = reg.file_path(metadata, type='work', **kwargs)
                if hdfs.path.exists(work_write_path):
                    logger.info("Deleting existing work files at " + work_write_path)
                    hdfs.rmr(work_write_path)
                hdfs.put(source_path, work_write_path + '/' + file_name)
            else:
                # a sandbox file might need the header row; it is far
                # easier to get it now than from hdfs
                header = get_header(file_name)
            # register that the raw file was written
            instance_guid = reg.register_raw(metadata, file_name, file_type, row_count)
        else:
            # if the file does exist, it is treated as a duplicate
            logger.info("Duplicate file")
            file_type = "duplicate"
            write_path = reg.dup_file_path(metadata)
            if hdfs.path.exists(write_path + '/' + file_name):
                # a duplicate of an existing duplicate replaces it
                logger.info("duplicate file already exists, overwriting")
                hdfs.rmr(write_path + '/' + file_name)
            hdfs.put(source_path, write_path + '/' + file_name)
            logger.info("writing duplicate file " + write_path + '/' + file_name)
            reg.register_raw(metadata, file_name, file_type, row_count)
    else:
        # no registry entry for this file, move it to spam
        file_type = "spam"
        write_path = reg.spam_file_path(metadata)
        logger.info("Moving " + file_name + " to " + write_path + '/' + file_name)
        if hdfs.path.exists(write_path + '/' + file_name):
            # a repeat of an existing spam file replaces it
            logger.info("spam file already exists, overwriting")
            hdfs.rmr(write_path + '/' + file_name)
        hdfs.put(source_path, write_path + '/' + file_name)
        logger.info("writing spam file " + write_path + '/' + file_name)
        reg.register_raw(metadata, file_name, file_type, row_count)

    # confirm that source file and target file have the same size,
    # regardless of spam, duplicate or normal
    target_path = write_path + '/' + file_name
    if hdfs.path.exists(target_path) and \
            hdfs.path.getsize(target_path) == os.stat(source_path).st_size:
        # if the file sizes match, delete the source file
        os.remove(source_path)
        logger.info("Landing zone file removed " + source_path)
    else:
        # if the file sizes do not match, delete the target file and rename
        # the source file so it doesn't get reprocessed repeatedly
        logger.error(
            "Source and target file sizes didn't match, not deleting source.")
        hdfs.rmr(target_path)
        os.rename(source_path, source_path + '.err')
        raise ValueError("Source and target file sizes don't match")

    # copy_only exists in case there's ever a reason not to process beyond
    # moving the file to HDFS
    if not kwargs.get('copy_only'):
        # only raw files are validated; spam and duplicates are not
        if file_type == "raw":
            if stage != 'sandbox':
                # not a sandbox file: proceed with full validation
                logger.info("Validate " + file_name)
                validator.main(file_name, instance_guid, metadata)
            else:
                # sandbox file: mark it as such so the validator only
                # creates the table
                logger.info("Sandbox validate " + file_name)
                validator.main(file_name, instance_guid, metadata,
                               header=header, stage=stage)

    # log that this PID is ending
    logger.info('Pid ending : ' + str(os.getpid()))