def get_info_from_substance_json(data):
    preview_url = utils.locate_item(data, ("label", "main"), return_as="parent")[0]["url"]
    extra_data = {item["key"]: item["value"] for item in data["extraData"]}

    dimensions = {}
    physical_size = extra_data.get("physicalSize")
    if physical_size:
        for letter, dimension in zip('xyz', physical_size.split("/")):
            dimensions[letter] = float(dimension) / 100.0

    tags = data["tags"]
    tags.append(extra_data["type"])

    info = {
        # "id": extra_data["originalName"],  # not always present
        "name": data["title"],
        "url": "https://source.substance3d.com/allassets/" + data["id"],
        "author": extra_data["author"],
        "author_url": "https://source.substance3d.com/",
        "licence": "EULA",
        "licence_url": "https://www.substance3d.com/legal/general-terms-conditions",
        "tags": tags,
        "preview_url": preview_url,
        # "description": "",
        "dimensions": dimensions
    }
    # info["preview_path"] = ""
    utils.remove_empty(info)
    return info
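# Illustrative only: a minimal input shape inferred from the field accesses in
# get_info_from_substance_json. The real Substance Source JSON carries many more
# fields, and the key holding the ("label", "main") preview entry ("attachments"
# here) is an assumption -- utils.locate_item is expected to find it recursively.
example_substance_data = {
    "id": "0123abcd",
    "title": "Red Brick Wall",
    "tags": ["brick", "wall"],
    "extraData": [
        {"key": "author", "value": "Some Author"},
        {"key": "type", "value": "material"},
        {"key": "physicalSize", "value": "300/300/10"},
    ],
    "attachments": [{"label": "main", "url": "https://example.com/preview.png"}],
}
# get_info_from_substance_json(example_substance_data)["dimensions"]
# -> {"x": 3.0, "y": 3.0, "z": 0.1}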
def get_info_from_sbsar_xml(xml_file):
    with open(xml_file, 'r', encoding="utf-8") as xml_text:
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(xml_text.read(), "html.parser")

    graph = soup.find("graph")
    attrs = graph.attrs  # type: dict

    tags = []
    keywords = attrs.get("keywords")
    if keywords:
        tags = re.split(r" |;|,", keywords.strip("; ").lower())
    category = attrs.get("category")
    if category:
        tags.extend(re.split(r" |/|,", category.lower()))
    tags = utils.deduplicate(tags)
    tags = list(filter(None, tags))

    id = None
    pkgurl = attrs.get("pkgurl")
    if pkgurl:
        match = re.search(r"(?<=pkg:\/\/).+", pkgurl)
        if match:
            id = match.group(0)

    if id:
        name = id
    else:
        name = os.path.splitext(os.path.basename(xml_file))[0]
    label = attrs.get("label")
    if label:
        name = label.strip(" ")

    dimensions = {}
    physicalsize = attrs.get("physicalsize")
    if physicalsize:
        for letter, dimension in zip('xyz', physicalsize.split(",")):
            dimensions[letter] = float(dimension) / 100.0

    info = {
        "id": id,
        "name": name,
        # "url": "",
        "author": attrs.get("author", ""),
        "author_url": attrs.get("authorurl", ""),
        # "licence": "",
        # "licence_url": "",
        "tags": tags,
        # "preview_url": "",
        "description": attrs.get("description", ""),
        "dimensions": dimensions,
        "xml_attrs": attrs
    }
    utils.remove_empty(info)
    return info
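# Illustrative only: a minimal companion XML with the <graph> attributes the
# parser above reads. The wrapping element name and all attribute values are
# assumptions, not taken from a real .sbsar description file.
example_sbsar_xml = """
<sbsdescription>
  <graph pkgurl="pkg://red_brick_wall" label="Red Brick Wall"
         keywords="brick;wall;" category="Materials/Brick"
         author="Some Author" authorurl="https://example.com"
         physicalsize="300,300,10" description="A tiling brick material"/>
</sbsdescription>
"""
# Written to disk and passed to get_info_from_sbsar_xml, this should yield
# id "red_brick_wall", name "Red Brick Wall", tags like ["brick", "wall",
# "materials"] and dimensions {"x": 3.0, "y": 3.0, "z": 0.1}.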
def on_patch(self, request, response, id: int):
    """(Partially) update a task.

    Example payload: `{"completed": true}`

    The following fields can be passed in the payload to be updated
    (any other field will be ignored):

    - title: str
    - due_date: str (ISO format)
    - completed: bool
    - priority: int
    """
    task = self.get_object(id)

    due_date = request.get_json('due_date', default=None)
    due_date = read_datetime(due_date)

    updated_fields = {
        'title': request.get_json('title', default=None),
        'due_date': due_date,
        'completed': request.get_json('completed', default=None),
        'priority': request.get_json('priority', default=None),
    }
    cleaned_fields = remove_empty(updated_fields)

    if cleaned_fields:
        for field, value in cleaned_fields.items():
            setattr(task, field, value)
        self.session.add(task)
        self.session.commit()

    response.status = falcon.HTTP_200
    response.json = task.serialized
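# A minimal sketch of the remove_empty helper as this handler uses it, assuming
# it simply drops the fields the client did not send (still None) so that only
# the provided values are written to the task. The project's real helper may
# behave differently.
def remove_empty(fields):
    return {name: value for name, value in fields.items() if value is not None}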
def predict_prob(self, batch_raw_texts):
    """Predict class probabilities for a batch of texts.

    Batch preprocessing can efficiently boost QPS thanks to the GPU's batched execution.

    Args:
        batch_raw_texts: list of strings
    """
    # text preprocessing
    batch_raw_texts = [remove_delimiter(raw_text) for raw_text in batch_raw_texts]
    batch_raw_texts = [remove_separator(raw_text) for raw_text in batch_raw_texts]
    batch_raw_texts = [remove_empty(raw_text) for raw_text in batch_raw_texts]
    batch_raw_texts = [remove_two_spaces(raw_text) for raw_text in batch_raw_texts]
    batch_raw_texts = [remove_three_spaces(raw_text) for raw_text in batch_raw_texts]

    # tokenize
    text_bert_indices = []
    for text in batch_raw_texts:
        ls_tokens = self.tokenizer.text_to_sequence("[CLS] " + text)
        text_bert_indices.append(ls_tokens)

    # convert to tensor
    text_bert_indices = torch.tensor(text_bert_indices, dtype=torch.int64).to(self.opt.device)
    t_inputs = [text_bert_indices]
    t_outputs = self.model(t_inputs)
    t_probs = F.softmax(t_outputs, dim=-1).cpu().detach().numpy()
    return t_probs
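# Illustrative usage (names assumed): `classifier` is an instance of the class
# this method belongs to, with tokenizer, model and opt.device already set up.
# probs = classifier.predict_prob(["first raw text ...", "second raw text ..."])
# probs has shape (batch_size, num_classes); each row sums to ~1.0 after softmax.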
def main():
    # hyperparameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='../data/Users/william.teo/Downloads/sqlite.db', type=str, help="data path")
    parser.add_argument('--output_path', default='../data', type=str, help="path to save train/test text data")
    parser.add_argument('--table_name', default='data', type=str, help='table name')
    parser.add_argument('--train_test_ratio', default=0.2, type=float, help='set ratio between 0 and 1 for train/test split')
    opt = parser.parse_args()

    # load data
    conn = sqlite3.connect(opt.data_path)
    df = pd.read_sql_query("SELECT * FROM {}".format(opt.table_name), conn).drop(columns=["index"])
    # df = df.sample(100)

    # text cleaning
    df["text"] = df["text"].apply(remove_delimiter)
    df["text"] = df["text"].apply(remove_separator)
    df["text"] = df["text"].apply(remove_empty)
    df["text"] = df["text"].apply(remove_two_spaces)
    df["text"] = df["text"].apply(remove_three_spaces)

    # train/test split
    assert 0 <= opt.train_test_ratio < 1
    df_train, df_test = train_test_split(df, test_size=opt.train_test_ratio)

    # save train/test result
    df_to_txt(df_train, os.path.join(opt.output_path, "train.txt"))
    df_to_txt(df_test, os.path.join(opt.output_path, "test.txt"))
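# Illustrative invocation (the script name is assumed; flag names come from the
# argparse definitions above):
# python prepare_data.py --data_path ../data/sqlite.db --table_name data \
#     --output_path ../data --train_test_ratio 0.2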
def get_megascan_info_from_json(mega_info):
    name = None

    tags = mega_info.get("tags", [])
    tags.extend(mega_info.get("categories", []))
    semantic_tags = mega_info.get("semanticTags")
    if semantic_tags:
        semantic_tags.pop("industry", None)
        for key, value in semantic_tags.items():
            if isinstance(value, list):
                tags.extend(value)
            elif key in ("subject_matter", "asset_type"):
                tags.append(value)
        name = semantic_tags.get("name")
    if not name:
        name = mega_info.get("name", "")
    tags = list(map(lambda x: x.lower().strip(" "), dict.fromkeys(tags)))

    meta = {item["key"]: item["value"] for item in mega_info.get("meta", [])}

    number_pattern = re.compile(r"\d+(?:\.\d+)?")

    dimensions = {}

    x = meta.get("length")
    if x:
        x = float(number_pattern.search(x).group(0))
    y = meta.get("width")
    if y:
        y = float(number_pattern.search(y).group(0))

    if not x and not y:
        scan_area = meta.get("scanArea")
        if not scan_area:
            sizes = utils.locate_item(mega_info, "physicalSize", is_dict_key=True, return_as='data')
            if sizes:
                scan_area = Counter(sizes).most_common(1)[0][0]
        if scan_area:
            sizes = number_pattern.findall(scan_area)
            if len(sizes) == 2:
                x = float(sizes[0])
                y = float(sizes[1])
            elif len(sizes) == 1:
                x = y = float(sizes[0])

    if x:
        dimensions['x'] = x
    if y:
        dimensions['y'] = y

    z = meta.get("height")
    if z:
        dimensions['z'] = float(number_pattern.search(z).group(0))

    info = {
        # "id": "",  # a slug can be taken from the json listing files
        "name": name,
        "url": f"https://quixel.com/megascans/home?assetId={mega_info['id']}",
        "author": "Quixel Megascans",
        "author_url": "https://quixel.com/megascans",
        "licence": "EULA",
        "licence_url": "https://quixel.com/terms",
        "tags": tags,
        # "preview_url": "",  # probably generated by some javascript
        # "description": "",  # not available
        "dimensions": dimensions,
    }
    utils.remove_empty(info)
    return info
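# Illustrative only: the kind of "meta" entries the dimension lookup above
# expects (keys come from the meta.get(...) calls; the values are assumptions).
# A "length"/"width" pair like this yields dimensions {"x": 3.0, "y": 3.0,
# "z": 0.25} without falling back to scanArea or physicalSize.
example_meta = [
    {"key": "length", "value": "3 m"},
    {"key": "width", "value": "3 m"},
    {"key": "height", "value": "0.25 m"},
]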
def get_web_ambientcg_info(url, content_folder):
    # https://cc0textures.com/view?id=Plaster003
    # https://ambientcg.com/view?id=Bricks056
    if "cc0textures.com" in url or "ambientcg.com" in url:
        match = re.search(r"(?<=id=)[a-zA-Z0-9]+", url)
        if not match:
            return False, "Not a valid AmbientCG url."
        id = match.group(0)
    elif "cc0.link" in url:
        # https://cc0.link/a/Plaster003
        url = url.split("?")[0].split("#")[0].rstrip("/")
        id = url.split("/")[-1]

    api_url = f"https://ambientcg.com/api/v2/full_json?id={id}&sort=Latest&limit=1&include=tagData%2CdisplayData%2CdimensionsData%2CdownloadData%2CpreviewData%2CimageData"
    headers = {'User-Agent': 'Blender'}

    import requests
    response = requests.get(api_url, headers=headers)
    if response.status_code != 200:
        return False, response.text

    json = response.json()
    asset = json["foundAssets"][0]

    if asset["dataType"] == "3DModel":
        return False, "3DModel is not supported yet."

    dimensions = {}
    for letter, name in zip('xyz', ("dimensionX", "dimensionY", "dimensionZ")):
        dimension = asset.get(name)
        if dimension:
            dimensions[letter] = int(dimension) / 100

    info = {
        "id": id,
        "name": asset["displayName"],
        "url": f"https://ambientcg.com/view?id={id}",
        "author": "ambientcg",
        "author_url": "https://ambientcg.com",
        "licence": "CC0",
        "licence_url": "https://help.ambientcg.com/01-General/Licensing.html",
        "tags": asset["tags"],
        "preview_url": asset["previewImage"]["1024-PNG"],
        "description": asset.get("description"),
        "dimensions": dimensions
    }
    info['material_settings'] = {'Y- Normal Map': 1}

    if content_folder:
        download = utils.locate_item(asset["downloadFolders"], ("attribute", "4K-JPG"), return_as="parent")[0]
        url = download["downloadLink"]  # e.g. "https://cc0textures.com/get?file=Plaster003_4K-PNG.zip"
        info["downloadLink"] = url
        info["fileName"] = download["fileName"]  # e.g. "Plaster003_4K-PNG.zip"

    utils.remove_empty(info)
    return True, info
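# Illustrative usage: with content_folder left empty only the metadata is
# collected; passing a folder also records the 4K-JPG download link and file
# name. The URL below is one of the examples from the comments above.
# is_ok, result = get_web_ambientcg_info("https://ambientcg.com/view?id=Bricks056", None)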
def get_web_texturehaven_info(url, content_folder):
    # https://texturehaven.com/tex/?t=brick_wall_003
    url = url.split("#")[0]
    if "texturehaven.com/tex/" not in url:
        return False, "Not a valid Texture Haven url."
    match = re.search(r"(?<=t=)[a-zA-Z0-9_]+", url)
    id = match.group(0)

    import requests
    response = requests.get(url)
    if response.status_code != 200:
        return False, response.text

    from bs4 import BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    dimensions = {}
    tags = []

    for item in soup.find(name="div", id="item-info").findAll("li"):
        title = item.get("title")
        if not title:
            b = item.find('b')
            if b:
                title = b.string
        if title:
            if title.startswith("Author"):
                author = title.split(":")[1].strip()
                author_url = f"https://texturehaven.com/textures/?a={author}"
            elif title.startswith("Real-world"):
                dimensions_title = title.split(":")[1].strip()
                number_pattern = re.compile(r"\d+\.?\d*")
                for letter, number in zip('xyz', number_pattern.findall(dimensions_title)):
                    dimensions[letter] = float(number)
            elif title.startswith(("Categories", "Tags")):
                tags.extend([a.string.lower().strip() for a in item.findAll("a")])

    preview_url = "https://texturehaven.com" + soup.find(name="div", id="item-preview").find("img")["src"]

    info = {
        "id": id,
        "name": id,
        "url": url,
        "author": author,
        "author_url": author_url,
        "licence": "CC0",
        "licence_url": "https://texturehaven.com/p/license.php",
        "tags": tags,
        "preview_url": preview_url,
        # "description": "",
        "dimensions": dimensions,
    }
    utils.remove_empty(info)

    if content_folder:
        downloads = []
        for a in soup.findAll("a"):
            if a.get("download"):
                href = a["href"]
                if "/4k/" in href:
                    name = href.split("/")[-1].lower()
                    type = type_definer.get_type(name, config={"is_rgb_plus_alpha": True})
                    if not type or len(type) != 1:
                        continue
                    type = type[0]
                    if ("/jpg/4k/" in href and type in ('diffuse', 'albedo', 'normal', 'roughness', 'ambient_occlusion')) or \
                       ("/png/4k/" in href and type in ('displacement',)):
                        downloads.append("https://texturehaven.com" + href)

        # prefer OpenGL normal maps: drop the DirectX normal if a GL one exists
        for download in downloads.copy():
            if "dx_normal" in download.lower():
                for _download in downloads.copy():
                    if "gl_normal" in _download.lower():
                        downloads.remove(download)
                        break

        downloads = utils.deduplicate(downloads)
        info["downloads"] = downloads

    return True, info
def main(_):
    tf.logging.set_verbosity(_verbosity_levels[tf.flags.FLAGS.verbosity])

    params = {
        'criterion': tf.flags.FLAGS.criterion,
        'max_iter': tf.flags.FLAGS.maxiter,
        'kernel': tf.flags.FLAGS.kernel,
        'bandwidth': tf.flags.FLAGS.bandwidth,
        'n_clusters': tf.flags.FLAGS.nclusters,
        'batch_size': tf.flags.FLAGS.batchsize
    }

    data = generate_random(100, 500)
    # assert os.path.exists(tf.flags.FLAGS.data)

    if tf.flags.FLAGS.method in methods:
        cl = methods[tf.flags.FLAGS.method](**remove_empty(params))
        labels = cl.fit(data)
        centroids = cl.centroids
        history = cl.history
    else:
        history = load(os.path.join(tf.flags.FLAGS.save, 'history.npy'))
        centroids = load(os.path.join(tf.flags.FLAGS.save, 'centroids.npy'))
        labels = load(os.path.join(tf.flags.FLAGS.save, 'labels.npy'))

        if tf.flags.FLAGS.method == 'visualize':
            assert len(history) > 1 \
                and history[0].shape[0] == labels.shape[0], 'Invalid history'
            plot(history, data, labels, centroids, draw_lines=False)
        elif tf.flags.FLAGS.method == 'visualize_animated':
            assert len(history) > 1 \
                and history[0].shape[0] == labels.shape[0], 'Invalid history'
            animated_plot(history, labels)
        else:
            raise ValueError('--method parameter must be either '
                             '< mean_shift >, < mini_batch_mean_shift >, '
                             '< kmeans > or < mini_batch_kmeans >.')
        return

    if history is None:
        tf.logging.warn('Data is too large to visualize.')
    elif data.shape[1] != 2:
        tf.logging.warn('Data must be 2 dimensional to visualize.')
    else:
        tf.logging.info('Creating plot for history visualization.')
        plot(history, data, labels, centroids, draw_lines=False)

    save(os.path.join(tf.flags.FLAGS.save, 'history.npy'), history)
    save(os.path.join(tf.flags.FLAGS.save, 'centroids.npy'), centroids)
    save(os.path.join(tf.flags.FLAGS.save, 'labels.npy'), labels)
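# Illustrative invocation (the script name and flag values are assumptions;
# the flag names come from the tf.flags.FLAGS accesses above):
# python clustering.py --method mini_batch_kmeans --nclusters 8 \
#     --batchsize 64 --maxiter 100 --save ./results --verbosity info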
if __name__ == '__main__':
    pdf_dir = "/home/mahad/abbyy_dummy_dataset/pdf"
    xml_dir = "/home/mahad/abbyy_dummy_dataset/xml"
    save_dir = "/tmp"

    pdf_files = os.listdir(pdf_dir)
    xml_files = os.listdir(xml_dir)

    for xml_file in xml_files:
        print(xml_file)
        xml_path = os.path.join(xml_dir, xml_file)
        pdf_path = os.path.join(pdf_dir, Path(xml_file).stem + ".pdf")
        xml_data = get_raw_data(xml_path)

        for page in xml_data:
            para_boxes = page["para_boxes"]
            para_texts = page["para_texts"]
            para_boxes, para_texts = remove_empty(para_boxes, para_texts)

            tables = page["tables"]
            table_boxes = [tt["bbox"] for tt in tables]
            table_texts = [tt["rows"] for tt in tables]

            img = pdf2image.convert_from_path(pdf_path,
                                              size=(page["width"], page["height"]),
                                              first_page=page["page_number"],
                                              last_page=page["page_number"])
            img = np.asarray(img[0])

            all_boxes = para_boxes + table_boxes
            all_texts = para_texts + table_texts

            column_blocks = get_blocks((page["height"], page["width"]), all_boxes)
            column_blocks_merged = merge_blocks(column_blocks, all_boxes)
            ordered_boxes = create_order(column_blocks_merged, all_boxes)

            ordered_texts = []
            for i in range(len(ordered_boxes)):
                idx = all_boxes.index(ordered_boxes[i])
                ordered_texts.append(all_texts[idx])
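# A minimal sketch of the two-list remove_empty variant used above, assuming it
# drops paragraphs whose text is blank while keeping boxes and texts aligned.
# The real helper in this project may differ.
def remove_empty(boxes, texts):
    kept = [(box, text) for box, text in zip(boxes, texts) if text and text.strip()]
    return [box for box, _ in kept], [text for _, text in kept]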