Example #1
def get_info_from_substance_json(data):

    preview_url = utils.locate_item(data, ("label", "main"),
                                    return_as="parent")[0]["url"]

    extra_data = {item["key"]: item["value"] for item in data["extraData"]}

    dimensions = {}
    physicalSize = extra_data.get("physicalSize")
    if physicalSize:
        for letter, dimension in zip('xyz', physicalSize.split("/")):
            dimensions[letter] = float(dimension) / 100.0

    tags = data["tags"]
    tags.append(extra_data["type"])

    info = {
        # "id": extra_data["originalName"], # not always
        "name": data["title"],
        "url": "https://source.substance3d.com/allassets/" + data["id"],
        "author": extra_data["author"],
        "author_url": "https://source.substance3d.com/",
        "licence": "EULA",
        "licence_url":
        "https://www.substance3d.com/legal/general-terms-conditions",
        "tags": tags,
        "preview_url": preview_url,
        # "description": "",
        "dimensions": dimensions
    }

    # info["preview_path"] = ""

    utils.remove_empty(info)
    return info
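The `utils.remove_empty` helper that Examples #1, #2, #6, #7 and #8 call before returning is not shown on this page. Judging by how the callers ignore its return value and expect empty fields (such as an empty `dimensions` dict) to vanish from the info dict, it is probably an in-place filter of falsy values. A minimal sketch under that assumption; the real implementation may differ:

# Hypothetical sketch only; the actual utils.remove_empty is not included in these examples.
# Assumed behaviour: strip falsy values ("", [], {}, None) from a dict in place.
def remove_empty(info: dict) -> dict:
    for key in [key for key, value in info.items() if not value]:
        del info[key]
    return info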
Example #2
def get_info_from_sbsar_xml(xml_file):
    with open(xml_file, 'r', encoding="utf-8") as xml_text:
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(xml_text.read(), "html.parser")
        graph = soup.find("graph")
        attrs = graph.attrs  # type: dict

        tags = []
        keywords = attrs.get("keywords")
        if keywords:
            tags = re.split(r" |;|,", keywords.strip("; ").lower())

        category = attrs.get("category")
        if category:
            tags.extend(re.split(r" |/|,", category.lower()))

        tags = utils.deduplicate(tags)
        tags = list(filter(None, tags))

        id = None
        pkgurl = attrs.get("pkgurl")
        if pkgurl:
            match = re.search(r"(?<=pkg:\/\/).+", pkgurl)
            if match:
                id = match.group(0)

        if id:
            name = id
        else:
            name = os.path.splitext(os.path.basename(xml_file))[0]
        label = attrs.get("label")
        if label:
            name = label.strip(" ")

        dimensions = {}
        physicalsize = attrs.get("physicalsize")
        if physicalsize:
            for letter, dimension in zip('xyz', physicalsize.split(",")):
                dimensions[letter] = float(dimension) / 100.0

        info = {
            "id": id,
            "name": name,
            # "url": "",
            "author": attrs.get("author", ""),
            "author_url": attrs.get("authorurl", ""),
            # "licence": "",
            # "licence_url": "",
            "tags": tags,
            # "preview_url": "",
            "description": attrs.get("description", ""),
            "dimensions": dimensions,
            "xml_attrs": attrs
        }

        utils.remove_empty(info)
        return info
Example #3
    def on_patch(self, request, response, id: int):
        """(Partially) update a task.

        Example payload: `{"completed": true}`

        The following fields can be passed in the payload to be updated
        (any other field will be ignored):

        - title: str
        - due_date: str (ISO format)
        - completed: bool
        - priority: int
        """
        task = self.get_object(id)

        due_date = request.get_json('due_date', default=None)
        due_date = read_datetime(due_date)

        updated_fields = {
            'title': request.get_json('title', default=None),
            'due_date': due_date,
            'completed': request.get_json('completed', default=None),
            'priority': request.get_json('priority', default=None),
        }

        cleaned_fields = remove_empty(updated_fields)

        if cleaned_fields:
            for field, value in cleaned_fields.items():
                setattr(task, field, value)
            self.session.add(task)
            self.session.commit()

        response.status = falcon.HTTP_200
        response.json = task.serialized
Example #4
    def predict_prob(self, batch_raw_texts):
        """
        batch preprocessing can efficiently bosst qps due to using gpu's nature.
        

        paras:
            raw_texts: list of string
        """
        # text-preprocessing
        batch_raw_texts = [remove_delimiter(raw_text) for raw_text in batch_raw_texts]
        batch_raw_texts = [remove_separator(raw_text) for raw_text in batch_raw_texts]
        batch_raw_texts = [remove_empty(raw_text) for raw_text in batch_raw_texts]
        batch_raw_texts = [remove_two_spaces(raw_text) for raw_text in batch_raw_texts]
        batch_raw_texts = [remove_three_spaces(raw_text) for raw_text in batch_raw_texts]
        # tokenize
        text_bert_indices = []
        for text in batch_raw_texts:
            ls_tokens = self.tokenizer.text_to_sequence("[CLS] " + text)
            text_bert_indices.append(ls_tokens)
        # convert to tensor
        text_bert_indices = torch.tensor(text_bert_indices, dtype=torch.int64).to(self.opt.device)

        t_inputs = [text_bert_indices]
        t_outputs = self.model(t_inputs)

        t_probs = F.softmax(t_outputs, dim=-1).cpu().detach().numpy()
        return t_probs
Example #5
def main():
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='../data/Users/william.teo/Downloads/sqlite.db', type=str, help = "data path")
    parser.add_argument('--output_path', default='../data', type=str, help = "path to save train/test text data ")
    parser.add_argument('--table_name', default='data', type=str, help='table name')
    parser.add_argument('--train_test_ratio', default=0.2, type=float, help='set ratio between 0 and 1 for train/test split')
    opt = parser.parse_args()

    # load data
    conn = sqlite3.connect(opt.data_path)
    df = pd.read_sql_query("SELECT * FROM {}".format(opt.table_name), conn).drop(columns = ["index"])
    #
    #df = df.sample(100)
    # text cleaning
    df["text"] = df["text"].apply(lambda x : remove_delimiter(x))
    df["text"] = df["text"].apply(lambda x : remove_separator(x))
    df["text"] = df["text"].apply(lambda x : remove_empty(x))
    df["text"] = df["text"].apply(lambda x : remove_two_spaces(x))
    df["text"] = df["text"].apply(lambda x : remove_three_spaces(x))

    # train/test split
    assert 0 <= opt.train_test_ratio < 1
    df_train, df_test = train_test_split(df, test_size = opt.train_test_ratio)
    # save train/test result
    df_to_txt(df_train, os.path.join(opt.output_path,"train.txt"))
    df_to_txt(df_test, os.path.join(opt.output_path,"test.txt"))
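In Examples #4 and #5 the same name, `remove_empty`, is applied to a single string inside a text-cleaning pipeline rather than to a dict, alongside helpers like `remove_delimiter` and `remove_two_spaces`. Those helpers are not included in the snippets; a plausible sketch, assuming `remove_empty` drops blank lines from the raw text:

# Hypothetical sketch; the real helper used in Examples #4 and #5 is not shown.
# Assumed behaviour: remove blank lines from a block of raw text.
def remove_empty(raw_text: str) -> str:
    lines = raw_text.splitlines()
    return "\n".join(line for line in lines if line.strip())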
Example #6
def get_megascan_info_from_json(mega_info):

    name = None

    tags = mega_info.get("tags", [])
    tags.extend(mega_info.get("categories", []))

    semantic_tags = mega_info.get("semanticTags")
    if semantic_tags:
        semantic_tags.pop("industry", None)
        for key, value in semantic_tags.items():
            if isinstance(value, list):
                tags.extend(value)
            elif key in ("subject_matter", "asset_type"):
                tags.append(value)

        name = semantic_tags.get("name")

    if not name:
        name = mega_info.get("name", "")

    tags = list(map(lambda x: x.lower().strip(" "), dict.fromkeys(tags)))

    meta = {item["key"]: item["value"] for item in mega_info.get("meta", [])}

    number_pattern = re.compile(r"\d+(?:\.\d+)?")

    dimensions = {}

    x = meta.get("length")
    if x:
        x = float(number_pattern.search(x).group(0))
    y = meta.get("width")
    if y:
        y = float(number_pattern.search(y).group(0))

    if not x and not y:
        scan_area = meta.get("scanArea")
        if not scan_area:
            sizes = utils.locate_item(mega_info,
                                      "physicalSize",
                                      is_dict_key=True,
                                      return_as='data')
            if sizes:
                scan_area = Counter(sizes).most_common(1)[0][0]
        if scan_area:
            sizes = number_pattern.findall(scan_area)
            if len(sizes) == 2:
                x = float(sizes[0])
                y = float(sizes[1])
            elif len(sizes) == 1:
                x = y = float(sizes[0])

    if x:
        dimensions['x'] = x
    if y:
        dimensions['y'] = y

    z = meta.get("height")
    if z:
        dimensions['z'] = float(number_pattern.search(z).group(0))

    info = {
        # "id": "", # can get a slug from the json listing files
        "name": name,
        "url": f"https://quixel.com/megascans/home?assetId={mega_info['id']}",
        "author": "Quixel Megascans",
        "author_url": "https://quixel.com/megascans",
        "licence": "EULA",
        "licence_url": "https://quixel.com/terms",
        "tags": tags,
        # "preview_url": "", # probably the url is generated by some javascript
        # "description": "", # does not have it
        "dimensions": dimensions,
    }

    utils.remove_empty(info)
    return info
Example #7
def get_web_ambientcg_info(url, content_folder):

    # https://cc0textures.com/view?id=Plaster003
    # https://ambientcg.com/view?id=Bricks056

    if "cc0textures.com" in url or "ambientcg.com" in url:
        match = re.search(r"(?<=id=)[a-zA-Z0-9]+", url)
        if not match:
            return False, "Not valid Ambient CG url."
        id = match.group(0)
    elif "cc0.link" in url:  # https://cc0.link/a/Plaster003
        url = url.split("?")[0].split("#")[0].rstrip("/")
        id = url.split("/")[-1]

    api_url = f"https://ambientcg.com/api/v2/full_json?id={id}&sort=Latest&limit=1&include=tagData%2CdisplayData%2CdimensionsData%2CdownloadData%2CpreviewData%2CimageData"

    headers = {'User-Agent': 'Blender'}

    import requests
    response = requests.get(api_url, headers=headers)
    if response.status_code != 200:
        return False, response.text

    json = response.json()

    asset = json["foundAssets"][0]

    if asset["dataType"] == "3DModel":
        return False, "3DModel is not supported yet."

    dimensions = {}
    for letter, name in zip('xyz', ("dimensionX", "dimensionY", "dimensionZ")):
        dimension = asset.get(name)
        if dimension:
            dimensions[letter] = int(dimension) / 100

    info = {
        "id": id,
        "name": asset["displayName"],
        "url": f"https://ambientcg.com/view?id={id}",
        "author": "ambientcg",
        "author_url": "https://ambientcg.com",
        "licence": "CC0",
        "licence_url": "https://help.ambientcg.com/01-General/Licensing.html",
        "tags": asset["tags"],
        "preview_url": asset["previewImage"]["1024-PNG"],
        "description": asset.get("description"),
        "dimensions": dimensions
    }

    info['material_settings'] = {'Y- Normal Map': 1}

    if content_folder:
        download = utils.locate_item(asset["downloadFolders"],
                                     ("attribute", "4K-JPG"),
                                     return_as="parent")[0]
        # e.g. "https://cc0textures.com/get?file=Plaster003_4K-PNG.zip"
        url = download["downloadLink"]
        info["downloadLink"] = url
        info["fileName"] = download["fileName"]  # "Plaster003_4K-PNG.zip"

    utils.remove_empty(info)
    return True, info
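Examples #7 and #8 share a `(success, payload)` return convention: on failure the second element is an error message, on success it is the info dict. A hedged usage sketch (the URL is just an illustration taken from the comments in Example #7):

# Hypothetical usage of Example #7, following its (ok, payload) convention.
ok, result = get_web_ambientcg_info("https://ambientcg.com/view?id=Bricks056", None)
if ok:
    print(result["name"], result["dimensions"])
else:
    print("Failed:", result)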
Example #8
def get_web_texturehaven_info(url, content_folder):
    # https://texturehaven.com/tex/?t=brick_wall_003
    url = url.split("#")[0]

    if not "texturehaven.com/tex/" in url:
        return False, "Not valid Texture Haven url."

    match = re.search(r"(?<=t=)[a-zA-Z0-9_]+", url)
    id = match.group(0)

    import requests
    response = requests.get(url)
    if response.status_code != 200:
        return False, response.text

    from bs4 import BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    author = ""
    author_url = ""
    dimensions = {}
    tags = []

    for item in soup.find(name="div", id="item-info").findAll("li"):
        title = item.get("title")

        if not title:
            b = item.find('b')
            if b:
                title = b.string

        if title:

            if title.startswith("Author"):
                author = title.split(":")[1].strip()
                author_url = f"https://texturehaven.com/textures/?a={author}"

            elif title.startswith("Real-world"):
                dimensions_title = title.split(":")[1].strip()
                number_pattern = re.compile(r"\d+\.?\d*")
                for letter, number in zip(
                        'xyz', number_pattern.findall(dimensions_title)):
                    dimensions[letter] = float(number)

            elif title.startswith(("Categories", "Tags")):
                tags.extend(
                    [a.string.lower().strip() for a in item.findAll("a")])

    preview_url = "https://texturehaven.com" + soup.find(
        name="div", id="item-preview").find("img")["src"]

    info = {
        "id": id,
        "name": id,
        "url": url,
        "author": author,
        "author_url": author_url,
        "licence": "CC0",
        "licence_url": "https://texturehaven.com/p/license.php",
        "tags": tags,
        "preview_url": preview_url,
        # "description": "",
        "dimensions": dimensions,
    }

    utils.remove_empty(info)

    if content_folder:
        downloads = []

        # for a in soup.findAll("a"):
        #     if a.get("download"):
        #         href = a["href"]
        #         if "/png/4k/" in href:
        #             name = href.split("/")[-1].lower()
        #             type = get_type(name)
        #             if type and len(type) == 1 and type[0] in ('diffuse', 'albedo', 'displacement', 'normal', 'roughness', 'ambient_occlusion'):
        #                 downloads.append("https://texturehaven.com" + href)

        for a in soup.findAll("a"):
            if a.get("download"):
                href = a["href"]
                if "/4k/" in href:
                    name = href.split("/")[-1].lower()
                    type = type_definer.get_type(
                        name, config={"is_rgb_plus_alpha": True})
                    if not type or len(type) != 1:
                        continue
                    type = type[0]
                    if ("/jpg/4k/" in href and type in
                        ('diffuse', 'albedo', 'normal', 'roughness',
                         'ambient_occlusion')) or ("/png/4k/" in href and type
                                                   in ('displacement', )):
                        downloads.append("https://texturehaven.com" + href)

        # Prefer OpenGL normal maps: drop a DirectX normal when a GL variant exists.
        for download in downloads.copy():
            if "dx_normal" in download.lower():
                if any("gl_normal" in d.lower() for d in downloads):
                    downloads.remove(download)

        downloads = utils.deduplicate(downloads)
        info["downloads"] = downloads

    return True, info
Example #9
def main(_):
    tf.logging.set_verbosity(_verbosity_levels[tf.flags.FLAGS.verbosity])

    params = {
        'criterion': tf.flags.FLAGS.criterion,
        'max_iter': tf.flags.FLAGS.maxiter,
        'kernel': tf.flags.FLAGS.kernel,
        'bandwidth': tf.flags.FLAGS.bandwidth,
        'n_clusters': tf.flags.FLAGS.nclusters,
        'batch_size': tf.flags.FLAGS.batchsize
    }

    data = generate_random(100, 500)

    # assert os.path.exists(tf.flags.FLAGS.data)

    if tf.flags.FLAGS.method in methods:

        cl = methods[tf.flags.FLAGS.method](**remove_empty(params))

        labels = cl.fit(data)
        centroids = cl.centroids
        history = cl.history

    else:
        history = load(os.path.join(tf.flags.FLAGS.save, 'history.npy'))
        centroids = load(os.path.join(tf.flags.FLAGS.save, 'centroids.npy'))
        labels = load(os.path.join(tf.flags.FLAGS.save, 'labels.npy'))

        if tf.flags.FLAGS.method == 'visualize':
            assert len(history) > 1 \
                   and history[0].shape[0] == labels.shape[0], 'Invalid ' \
                                                               'history'
            plot(history, data, labels, centroids, draw_lines=False)

        elif tf.flags.FLAGS.method == 'visualize_animated':
            assert len(history) > 1 \
                   and history[0].shape[0] == labels.shape[0], 'Invalid ' \
                                                               'history'
            animated_plot(history, labels)

        else:
            raise ValueError('--method parameter must be either '
                             '< means_shift >, '
                             '< mini_batch_mean_shift >, '
                             '< kmeans > or < mini_batch_kmeans >.')

        return

    if history is None:
        tf.logging.warn('Data is too large to visualize.')
    elif data.shape[1] != 2:
        tf.logging.warn('Data must be 2 dimensional to visualize.')
    else:
        tf.logging.info('Creating plot for history visualization.')

        plot(history, data, labels, centroids, draw_lines=False)

        save(os.path.join(tf.flags.FLAGS.save, 'history.npy'), history)
    save(os.path.join(tf.flags.FLAGS.save, 'centroids.npy'), centroids)
    save(os.path.join(tf.flags.FLAGS.save, 'labels.npy'), labels)
Example #10
if __name__ == '__main__':
    pdf_dir = "/home/mahad/abbyy_dummy_dataset/pdf"
    xml_dir = "/home/mahad/abbyy_dummy_dataset/xml"
    save_dir = "/tmp"
    pdf_files = os.listdir(pdf_dir)
    xml_files = os.listdir(xml_dir)
    for xml_file in xml_files:
        print(xml_file)
        xml_path = os.path.join(xml_dir, xml_file)
        pdf_path = os.path.join(pdf_dir, Path(xml_file).stem + ".pdf")
        xml_data = get_raw_data(xml_path)
        for page in xml_data:
            para_boxes = page["para_boxes"]
            para_texts = page["para_texts"]
            para_boxes, para_texts = remove_empty(para_boxes, para_texts)
            tables = page["tables"]
            table_boxes = [tt["bbox"] for tt in tables]
            table_texts = [tt["rows"] for tt in tables]
            img = pdf2image.convert_from_path(pdf_path, size=(page["width"], page["height"]),
                                              first_page=page["page_number"], last_page=page["page_number"])
            img = np.asarray(img[0])
            all_boxes = para_boxes + table_boxes
            all_texts = para_texts + table_texts
            column_blocks = get_blocks((page["height"], page["width"]), all_boxes)
            column_blocks_merged = merge_blocks(column_blocks, all_boxes)
            ordered_boxes = create_order(column_blocks_merged, all_boxes)
            ordered_texts = []
            for i in range(0, len(ordered_boxes)):
                idx = all_boxes.index(ordered_boxes[i])
                ordered_texts.append(all_texts[idx])
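Example #10 calls `remove_empty(para_boxes, para_texts)` and expects two lists back, so here the helper apparently drops paragraphs whose text is empty while keeping boxes and texts aligned. A sketch under that assumption (not the original implementation):

# Hypothetical sketch; the real remove_empty from Example #10 is not shown.
# Assumed behaviour: drop (box, text) pairs whose text is empty, keeping both lists aligned.
def remove_empty(para_boxes, para_texts):
    kept = [(box, text) for box, text in zip(para_boxes, para_texts)
            if text and text.strip()]
    if not kept:
        return [], []
    boxes, texts = zip(*kept)
    return list(boxes), list(texts)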