import os
import shutil
import sys
import tarfile
from os.path import join

import generate_tfrecord
import json_to_csv
import parse_meta


def main(dataset_path, percent_eval):
    OUTPUT_PATH = "/opt/ml/model"
    EXTRACT_PATH = "/home"
    TMP_PATH = "/home/tmp"
    if not os.path.exists(TMP_PATH):
        os.makedirs(TMP_PATH)
    if not os.path.exists(EXTRACT_PATH):
        os.makedirs(EXTRACT_PATH)
    try:
        shutil.copy(dataset_path, join(EXTRACT_PATH, 'data.tar'))
    except OSError:
        print('unable to retrieve the dataset tar file.')
        sys.exit(1)
    with tarfile.open(join(EXTRACT_PATH, 'data.tar')) as tar_file:
        tar_file.extractall(join(EXTRACT_PATH, 'out'))
    # Fall back to a 30% eval split when the requested percentage is invalid.
    if percent_eval > 100 or percent_eval < 0:
        percent_eval = 30
    json_to_csv.main(percent_eval)
    generate_tfrecord.main(TMP_PATH + "/train.csv",
                           join(OUTPUT_PATH, 'train.record'))
    generate_tfrecord.main(TMP_PATH + "/eval.csv",
                           join(OUTPUT_PATH, 'eval.record'))
    parse_meta.main(join(OUTPUT_PATH, 'map.pbtxt'))
    print(".\nRecords generated")
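# Usage sketch (assumption): main() above reads like the entry point of a
# training-container job, so the dataset path and eval split would normally
# arrive on the command line. The flag names below are illustrative and not
# part of the original code.
if __name__ == '__main__':
    import argparse

    arg_parser = argparse.ArgumentParser(
        description='Convert a tarred, JSON-annotated dataset to TFRecords.')
    arg_parser.add_argument('--dataset-path', required=True,
                            help='Path to the dataset .tar file.')
    arg_parser.add_argument('--percent-eval', type=int, default=30,
                            help='Percentage of examples held out for eval.')
    args = arg_parser.parse_args()
    main(args.dataset_path, args.percent_eval)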
import configparser
import os

import pandas as pd

import json_to_csv
# get_config_value and query_one_patent are project-local helpers.


def query():
    patent_values = []  # patent config-file parameters
    item_list = []  # list of ids to search for
    results_found = 0
    # Read the search configuration file.
    parser = configparser.ConfigParser()
    parser.read("./query_config.cfg")
    patent_values = get_config_value(parser, 0)
    item_list = list(
        set(
            open(os.path.join(
                patent_values[3],
                patent_values[2])).read().rstrip('\n').split('\n')))
    for item in item_list:
        n = query_one_patent(item, patent_values, results_found)
        results_found = n
    if results_found == 0:
        print("Query returned no results")
    else:
        # Generate a CSV from the JSON results.
        json_to_csv.main(patent_values[3], parser.sections()[0], results_found)
        # Drop duplicate rows from the CSV, sort, and save.
        output_filename = os.path.join(patent_values[3],
                                       parser.sections()[0] + '.csv')
        df = pd.read_csv(output_filename, dtype=object, encoding='Latin-1')
        df = df[patent_values[5]].drop_duplicates().sort_values(
            by=patent_values[6][0],
            ascending=[
                direction != 'desc' for direction in patent_values[6][1]
            ])
        df.to_csv(output_filename, index=False)
        print('({} rows returned)'.format(len(df)))
import os
import shutil
import sys
import tarfile
from os.path import join

import generate_tfrecord
import json_to_csv
import parse_meta


def main(dataset_paths, percent_eval, directory):
    OUTPUT_PATH = directory
    EXTRACT_PATH = "/home"
    TMP_PATH = "/home/tmp"
    if not os.path.exists(TMP_PATH):
        os.makedirs(TMP_PATH)
    if not os.path.exists(EXTRACT_PATH):
        os.makedirs(EXTRACT_PATH)
    try:
        # Copy each archive under its own name so earlier copies are not
        # overwritten.
        for i in dataset_paths:
            shutil.copy(i, join(EXTRACT_PATH, os.path.basename(i)))
    except OSError:
        print('unable to retrieve a dataset tar file.')
        sys.exit(1)
    for dataset in dataset_paths:
        with tarfile.open(dataset) as tar_file:
            tar_file.extractall(join(EXTRACT_PATH, 'out'))
    # Fall back to a 30% eval split when the requested percentage is invalid.
    if percent_eval > 100 or percent_eval < 0:
        percent_eval = 30
    json_to_csv.main(percent_eval)
    try:
        generate_tfrecord.main(TMP_PATH + "/train.csv",
                               join(OUTPUT_PATH, 'train.record'))
        generate_tfrecord.main(TMP_PATH + "/eval.csv",
                               join(OUTPUT_PATH, 'eval.record'))
        parse_meta.main(join(OUTPUT_PATH, 'map.pbtxt'))
        print(".\nRecords generated")
    except ValueError:
        print("The datasets provided do not have the same class labels. "
              "Please make sure that labels are spelt the same in both "
              "datasets, or label the same objects for both datasets.")
import configparser
import json
import os

import pandas as pd
import requests

import json_to_csv


def query(configfile):
    # Query the PatentsView database using parameters specified in configfile.
    parser = configparser.ConfigParser()
    parser.read(configfile)
    # Loop through the separate queries listed in the config file.
    for q in parser.sections():
        print("Running query: ", q)
        # Parse parameters from the config file.
        entity = json.loads(parser.get(q, 'entity'))
        url = 'http://www.patentsview.org/api/' + entity + '/query?'
        input_file = json.loads(parser.get(q, 'input_file'))
        directory = json.loads(parser.get(q, 'directory'))
        input_type = json.loads(parser.get(q, 'input_type'))
        fields = json.loads(parser.get(q, 'fields'))
        try:
            # If specified, 'sort' should be a list of dictionaries giving
            # the order of keys and the direction for each key.
            sort = json.loads(parser.get(q, 'sort'))
            sort_fields, sort_directions = [], []
            for dct in sort:
                for field in dct:
                    # We can only sort by fields that are in the data.
                    if field in fields:
                        sort_fields.append(field)
                        sort_directions.append(dct[field])
            if len(sort_fields) == 0:
                sort_fields = [fields[0]]
                sort_directions = ["asc"]
        except (configparser.NoOptionError, json.JSONDecodeError):
            sort_fields = [fields[0]]
            sort_directions = ["asc"]
        criteria = {
            "_and": [
                json.loads(parser.get(q, option))
                for option in parser.options(q)
                if option.startswith('criteria')
            ]
        }
        item_list = list(
            set(open(os.path.join(directory, input_file)).read().split('\n')))
        results_found = 0
        for item in item_list:
            params = {
                'q': {
                    "_and": [{
                        input_type: item
                    }, criteria]
                },
                'f': fields
            }
            r = requests.post(url, data=json.dumps(params))
            if 400 <= r.status_code <= 499:
                print("Client error when querying for value {}".format(item))
            elif r.status_code >= 500:
                print("Server error when querying for value {}. You may be "
                      "exceeding the maximum API request size (1GB)."
                      .format(item))
            elif json.loads(r.text)['count'] != 0:
                outp = open(
                    os.path.join(directory,
                                 q + '_' + str(results_found) + '.json'), 'w')
                print(r.text, end='', file=outp)
                outp.close()
                results_found += 1
        if results_found == 0:
            print("Query {} returned no results".format(q))
        else:
            # Output a merged CSV of formatted results.
            json_to_csv.main(directory, q, results_found)
            # Clean the CSV: reorder columns, drop duplicates, sort, then save.
            output_filename = os.path.join(directory, q + '.csv')
            df = pd.read_csv(output_filename, dtype=object, encoding='Latin-1')
            df = df[fields].drop_duplicates().sort_values(
                by=sort_fields,
                ascending=[
                    direction != 'desc' for direction in sort_directions
                ])
            df.to_csv(output_filename, index=False)
            print('({} rows returned)'.format(len(df)))
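# Example query_config.cfg for query() above. This is an illustrative sketch:
# the section name, file names, and criteria values are made up, but every
# value must be valid JSON because query() feeds each option through
# json.loads(), and every option whose name starts with 'criteria' is AND-ed
# into the query.
#
#   [my_patents]
#   entity = "patents"
#   input_file = "patent_ids.txt"
#   directory = "./results"
#   input_type = "patent_number"
#   fields = ["patent_number", "patent_title", "patent_date"]
#   sort = [{"patent_date": "desc"}]
#   criteria1 = {"_gte": {"patent_date": "2010-01-01"}}
#
# Invocation sketch:
#   query('./query_config.cfg')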
import os
import shutil
import sys
import tarfile
import zipfile
from fnmatch import fnmatch
from os.path import join

import generate_tfrecord
import json_to_csv
import parse_meta


def main(dataset_paths, percent_eval, directory):
    ROOT_PATH, PATH_EXT = os.path.splitext(dataset_paths)
    DATASET_NAME = ROOT_PATH.split('/')[-1]
    OUTPUT_PATH = directory
    EXTRACT_PATH = "/home"
    TMP_PATH = "/home/tmp"
    # Placeholder for an enum: 1 means tar, 0 means zip.
    NORMAL_MODE = 1  # Assume this is a tar
    if not os.path.exists(TMP_PATH):
        os.makedirs(TMP_PATH)
    if not os.path.exists(EXTRACT_PATH):
        os.makedirs(EXTRACT_PATH)
    if PATH_EXT == '.zip':
        print('.zip file extension found, interpreting as tensorflow object '
              'detection csv zip')
        NORMAL_MODE = 0  # Not a tar file
    if NORMAL_MODE:
        # Tar mode: copy and extract the archive, then convert the JSON
        # annotations to CSV before generating records.
        print("normal mode")
        try:
            shutil.copy(dataset_paths, join(EXTRACT_PATH, 'data.tar'))
        except OSError:
            print('unable to retrieve a dataset tar file.')
            sys.exit(1)
        with tarfile.open(dataset_paths) as tar_file:
            tar_file.extractall(join(EXTRACT_PATH, 'out'))
        # Fall back to a 30% eval split when the requested percentage is
        # invalid.
        if percent_eval > 100 or percent_eval < 0:
            percent_eval = 30
        json_to_csv.main(percent_eval)
        try:
            generate_tfrecord.main(TMP_PATH + "/train.csv",
                                   join(OUTPUT_PATH, 'train.record'),
                                   NORMAL_MODE, "/home/")
            generate_tfrecord.main(TMP_PATH + "/eval.csv",
                                   join(OUTPUT_PATH, 'eval.record'),
                                   NORMAL_MODE, "/home/")
            parse_meta.main(join(OUTPUT_PATH, 'map.pbtxt'), NORMAL_MODE,
                            TMP_PATH + "/eval.csv")
            print(".\nRecords generated")
        except ValueError:
            print("The datasets provided do not have the same class labels. "
                  "Please make sure that labels are spelt the same in both "
                  "datasets, or label the same objects for both datasets.")
    if not NORMAL_MODE:
        print('treating as zip of tf obj detect')
        # Unzip the archive into the correct directory (assuming one zip at
        # this time). If the zip's top level already holds the split
        # directories, extract into a directory named after the dataset.
        with zipfile.ZipFile(dataset_paths, 'r') as zip_file:
            namelist = zip_file.namelist()[-1]
            if any(namelist.startswith(i) for i in ["valid", "train", "test"]):
                zip_file.extractall(EXTRACT_PATH + "/" + DATASET_NAME)
            else:
                zip_file.extractall(EXTRACT_PATH)
        # List every extracted CSV, for debugging.
        pattern = "*.csv"
        for path, subdirs, files in os.walk(EXTRACT_PATH):
            for name in files:
                if fnmatch(name, pattern):
                    print("CSV:", os.path.join(path, name))
        # Generate the records from the train/test annotation CSVs.
        print(EXTRACT_PATH + "/" + DATASET_NAME + "/test/_annotations.csv")
        generate_tfrecord.main(
            EXTRACT_PATH + "/" + DATASET_NAME + "/test/_annotations.csv",
            join(OUTPUT_PATH, 'eval.record'), NORMAL_MODE,
            EXTRACT_PATH + "/" + DATASET_NAME + "/test/")
        generate_tfrecord.main(
            EXTRACT_PATH + "/" + DATASET_NAME + "/train/_annotations.csv",
            join(OUTPUT_PATH, 'train.record'), NORMAL_MODE,
            EXTRACT_PATH + "/" + DATASET_NAME + "/train/")
        print('main records generated')
        parse_meta.main(
            join(OUTPUT_PATH, 'map.pbtxt'), NORMAL_MODE,
            EXTRACT_PATH + "/" + DATASET_NAME + "/train/_annotations.csv")
        # Edge case: a label may be missing from one of the CSVs.
        print(".\nRecords generated")
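# Usage sketch (assumptions): despite the plural parameter name, this final
# version of main() takes a single archive path and routes on its extension.
# The paths below are illustrative.
if __name__ == '__main__':
    # A tar of JSON-annotated images, holding out 30% for evaluation:
    main('/data/labeled_images.tar', 30, '/opt/ml/model')
    # A zip in TensorFlow object-detection CSV layout; percent_eval is unused
    # on this path because the zip already ships its own train/test split:
    # main('/data/tf_object_detection.zip', 30, '/opt/ml/model')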