Example 1
    def test_inference(self):
        model_name = "openai/clip-vit-base-patch32"
        model = CLIPModel.from_pretrained(model_name).to(torch_device)
        processor = CLIPProcessor.from_pretrained(model_name)

        image = prepare_img()
        inputs = processor(text=["a photo of a cat", "a photo of a dog"],
                           images=image,
                           padding=True,
                           return_tensors="pt").to(torch_device)

        # forward pass
        with torch.no_grad():
            outputs = model(**inputs)

        # verify the logits
        self.assertEqual(
            outputs.logits_per_image.shape,
            torch.Size(
                (inputs.pixel_values.shape[0], inputs.input_ids.shape[0])),
        )
        self.assertEqual(
            outputs.logits_per_text.shape,
            torch.Size(
                (inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
        )

        expected_logits = torch.tensor([[24.5701, 19.3049]],
                                       device=torch_device)

        self.assertTrue(
            torch.allclose(outputs.logits_per_image,
                           expected_logits,
                           atol=1e-3))
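
This test relies on a prepare_img() helper that is not shown in the excerpt. A minimal sketch of such a helper, assuming the usual COCO sample image used by the transformers test suite (the URL and helper body are assumptions, not part of the original):

import requests
from PIL import Image

def prepare_img():
    # hypothetical helper: download a small sample image to run the inference check against
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    return Image.open(requests.get(url, stream=True).raw)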
Example 2
    def __init__(self, path_or_model, **kwargs):
        super().__init__()
        # TODO: it's easier to let PL save everything so it can also restore with .load_from_checkpoint;
        # then, instead of passing a model or path, we should pass a config to construct a blank CLIP model
        # and use load weights to get the weights
        self.save_hyperparameters(ignore="path_or_model")

        if isinstance(path_or_model, str):
            path = path_or_model
            model = CLIPModel.from_pretrained(path)
        elif isinstance(path_or_model, CLIPModel):
            model = path_or_model
        else:
            raise TypeError("path_or_model must be a checkpoint path or a CLIPModel instance")

        model.logit_scale = nn.Parameter(
            torch.tensor(
                self.hparams.logit_scale_init,
                device=model.logit_scale.device,
                dtype=model.logit_scale.dtype,
            ))

        self.model = model
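
The constructor reads logit_scale_init from the hyperparameters captured by save_hyperparameters, so the wrapper is meant to be constructed with that keyword argument. A hedged usage sketch; the class name CLIPTune and the initial value are assumptions, since the snippet does not show the class definition:

# hypothetical usage of the LightningModule wrapper above
module = CLIPTune("openai/clip-vit-base-patch32", logit_scale_init=4.6052)
print(module.model.logit_scale)  # nn.Parameter initialised to logit_scale_init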
Example 3
from seesaw.roi_extractor import AgnosticRoIExtractor
from seesaw.roi_extractor import to_dataframe
import tensorflow as tf
import random
import matplotlib.pyplot as plt
import torchvision
from transformers import CLIPModel, CLIPProcessor

# IMPORT MODELS
maskrcnn_model = torchvision.models.detection.maskrcnn_resnet50_fpn(
    pretrained=True)
maskrcnn_model.eval()

roi_extractor = AgnosticRoIExtractor(maskrcnn_model)
roi_extractor.eval()

clip_model = CLIPModel.from_pretrained(
    "/home/gridsan/groups/fastai/omoll/seesaw_root2/models/clip-vit-base-patch32/"
)
clip_processor = CLIPProcessor.from_pretrained(
    "/home/gridsan/groups/fastai/omoll/seesaw_root2/models/clip-vit-base-patch32/"
)


def run_clip_proposal(image, boxes, padding):
    images, new_boxes = image_clipper(image, boxes, padding)
    inputs = clip_processor.feature_extractor(images=images,
                                              return_tensors="pt")
    vision_outputs = clip_model.vision_model(**inputs)
    image_embeds = vision_outputs[1]
    image_embeds = clip_model.visual_projection(image_embeds)
    image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
    return image_embeds
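
run_clip_proposal depends on an image_clipper helper that is not defined in this excerpt. A minimal sketch of what it might do, assuming image is a PIL image and boxes is an N x 4 tensor of (x1, y1, x2, y2) coordinates; the helper name and behaviour are assumptions:

def image_clipper(image, boxes, padding):
    # hypothetical stand-in: crop each proposal box out of the image, expanded by `padding` pixels
    crops, new_boxes = [], []
    for x1, y1, x2, y2 in boxes.tolist():
        x1, y1 = max(0, x1 - padding), max(0, y1 - padding)
        x2, y2 = min(image.width, x2 + padding), min(image.height, y2 + padding)
        crops.append(image.crop((x1, y1, x2, y2)))
        new_boxes.append([x1, y1, x2, y2])
    return crops, new_boxes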
Example 4
    def test_model_from_pretrained(self):
        for model_name in CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = CLIPModel.from_pretrained(model_name)
            self.assertIsNotNone(model)
Example 5
from sentence_transformers import SentenceTransformer, util, models
from PIL import ImageFile, Image
import numpy as np
import requests

###########

image = Image.open('two_dogs_in_snow.jpg')

from transformers import CLIPProcessor, CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

inputs = processor(text=["a cat", "a dog"], images=[image], return_tensors="pt", padding=True)
output = model(**inputs)
#vision_outputs = model.vision_model(pixel_values=inputs['pixel_values'])
#image_embeds = model.visual_projection(vision_outputs[1])

#print(image_embeds.shape)
#exit()

# Load the CLIP model via sentence-transformers
clip = models.CLIPModel()
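
The script computes output but never inspects it. A short follow-up sketch showing how the image-text similarity logits could be turned into probabilities over the two captions (names match the snippet above; the printout is illustrative):

probs = output.logits_per_image.softmax(dim=1)
print(probs)  # e.g. tensor([[p_cat, p_dog]]) for the single input image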
Example 6
def preprocess_roi_dataset(
    sds: SeesawDatasetManager,
    output_path,
    clip_model_path=None,
    cpu=False,
    image_limiter=None,
    box_limiter=100,
    padding=5,
):
    if (not cpu) and torch.cuda.is_available():
        device = torch.device("cuda")
        print("Using GPU")
    else:
        device = torch.device("cpu")
        print("Using CPU")
    dataset = sds.get_pytorch_dataset()
    output_path = resolve_path(output_path)
    assert not os.path.exists(output_path), "output path already exists"
    clip = False
    if clip_model_path:
        clip = True
        clip_model_path = resolve_path(clip_model_path)
        assert os.path.exists(clip_model_path), "clip model path doesn't exist"

    dirname = os.path.basename(output_path)
    dirpath = os.path.dirname(output_path)
    output_path = f"{dirpath}/.tmp.{dirname}"
    final_output_path = f"{dirpath}/{dirname}"

    os.makedirs(dirpath, exist_ok=True)

    if os.path.exists(output_path):  # remove old tmpfile
        shutil.rmtree(output_path)

    os.makedirs(output_path)
    '''
    vector_path = f"{output_path}/vectors"
    os.makedirs(vector_path)

    model_link = f"{output_path}/model"
    os.symlink(model_path, model_link)

    dataset_link = f"{output_path}/dataset"
    os.symlink(sds.dataset_root, dataset_link)

    real_prefix = f"{os.path.realpath(sds.image_root)}/"
    read_paths = ((real_prefix + sds.paths)).tolist()
    read_paths = [os.path.normpath(p) for p in read_paths]
    meta_dict = dict(zip(read_paths, zip(sds.paths, np.arange(len(sds.paths)))))
    '''

    maskrcnn_model = torchvision.models.detection.maskrcnn_resnet50_fpn(
        pretrained=True).to(device)
    maskrcnn_model.eval()

    roi_extractor = AgnosticRoIExtractor(maskrcnn_model).to(device)
    roi_extractor.eval()
    roi_extractor.model.rpn.min_size = 10
    roi_extractor.model.rpn.nms_thresh = 0

    # only load CLIP when a model path was provided; from_pretrained(None) would fail otherwise
    clip_model = None
    clip_processor = None
    if clip:
        clip_model = CLIPModel.from_pretrained(clip_model_path).to(device)
        clip_processor = CLIPProcessor.from_pretrained(clip_model_path)
    print("Models defined")
    ims = []
    paths = []
    with torch.no_grad():
        for i in range(len(dataset)):
            if i % 2000 == 0:
                if i != 0:
                    ans = list(zip(paths, output))
                    df = to_dataframe(ans)
                    df['dbidx'] = dbidx
                    if clip:
                        df['clip_feature'] = clip_features
                    #clip_array = run_clip_on_proposal()
                    #df.assign(clip_feature_vector=TensorArray(clip_array))
                    df.to_parquet(output_path + "/" + str(i) + ".parquet")
                clip_features = []
                output = []
                paths = []
                dbidx = []

            data = dataset[i]
            ims.append(data['image'])
            paths.append(data['file_path'])
            images = torchvision.transforms.ToTensor()(
                data['image']).unsqueeze(0).to(device)
            print("starting roi")
            a = roi_extractor(images)[0]
            if a['scores'].shape[0] > box_limiter:
                a['boxes'] = torch.split(a['boxes'], box_limiter)[0]
                a['scores'] = torch.split(a['scores'], box_limiter)[0]
                a['features'] = torch.split(a['features'].detach(),
                                            box_limiter)[0]
            dbidx.extend([i] * len(a['scores']))
            if clip:
                clip_array = run_clip_proposal(data['image'], a['boxes'],
                                               padding, clip_model,
                                               clip_processor, device)
                a['clip_feature_vector'] = clip_array
                clip_features += clip_array.tolist()
            output.append(a)
            print(i)

        ans = list(zip(paths, output))
        df = to_dataframe(ans)
        df['dbidx'] = dbidx
        if clip:
            df['clip_feature'] = clip_features
        #clip_array = run_clip_on_proposal()
        #df.assign(clip_feature_vector=TensorArray(clip_array))
        df.to_parquet(output_path + "/final.parquet")

        os.rename(output_path, final_output_path)
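
Note that run_clip_proposal is called here with model, processor, and device arguments, unlike the version in Example 3. A hedged sketch of a device-aware variant consistent with this call site, reusing the hypothetical image_clipper helper sketched earlier (both are assumptions, not part of the original):

def run_clip_proposal(image, boxes, padding, clip_model, clip_processor, device):
    # crop proposals, embed them with the CLIP vision tower, and L2-normalise the embeddings
    images, _ = image_clipper(image, boxes, padding)
    inputs = clip_processor.feature_extractor(images=images, return_tensors="pt").to(device)
    vision_outputs = clip_model.vision_model(**inputs)
    image_embeds = clip_model.visual_projection(vision_outputs[1])
    return image_embeds / image_embeds.norm(dim=-1, keepdim=True)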
Example 7
    scheduler = ASHAScheduler(
        max_t=max(grace_period, args.max_epochs),
        grace_period=grace_period,
        reduction_factor=2,
    )

    reporter = CLIReporter(
        parameter_columns=[],
        metric_columns=[
            metric,
            "training_iteration",
        ],
        max_report_frequency=60,
    )

    base_model = CLIPModel.from_pretrained(
        "/home/gridsan/omoll/xmodexp/notebooks/models/clip-vit-base-patch32")
    init_config = base_model.config
    init_state_dict = base_model.state_dict()

    processor = CLIPProcessor.from_pretrained(
        "/home/gridsan/omoll/xmodexp/notebooks/models/clip-vit-base-patch32")

    trainable = make_trainable(
        num_epochs=args.max_epochs,
        gpus_per_trial=gpus_per_trial,
        dataset=bird_dataset,
        init_config=init_config,
        init_state_dict=init_state_dict,
        processor=processor,
    )
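
The excerpt ends after building trainable. With Ray Tune, the scheduler and reporter would typically be handed to tune.run; a rough continuation sketch, where mode and num_samples are placeholder assumptions:

from ray import tune

# hypothetical continuation of the tuning setup above
analysis = tune.run(
    trainable,
    metric=metric,
    mode="max",
    scheduler=scheduler,
    progress_reporter=reporter,
    num_samples=10,
)
print(analysis.best_config)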