def Comic():
    """Build the per-image multi-label CSV files for the Comic dataset.

    For each image of the train/test splits, sets the column of a class to 1
    when the VOC-style XML annotation of the image contains at least one
    instance of that class (the column stays -1 otherwise), then writes the
    concatenated table to ``path_output + 'comic_all.csv'`` and
    ``path_output + 'comic.csv'``.
    """
    classes = ['bicycle', 'bird', 'car', 'cat', 'dog', 'person']
    sets = [('comic', 'train'), ('comic', 'test')]
    frames = []
    for base, image_set in sets:
        path_b = '/media/gonthier/HDD/data/cross-domain-detection/datasets/comic/ImageSets/Main/%s.txt' % (image_set)
        pd_b = pd.read_csv(path_b, sep=r"\s*", names=['name_img'], dtype=str)
        for c in classes:
            pd_b[c] = -1  # -1 = class absent (default)
        print(pd_b.head(5))
        for index, row in pd_b.iterrows():
            i = row['name_img']
            path_i = '/media/gonthier/HDD/data/cross-domain-detection/datasets/comic/Annotations/%s.xml' % (i)
            read_file = voc_eval.parse_rec(path_i)
            for element in read_file:
                classe_elt = element['name']
                # Mark the label column when the annotated class is tracked.
                if classe_elt in classes:
                    pd_b.loc[pd_b['name_img'] == row['name_img'], classe_elt] = 1
        pd_b['set'] = image_set
        frames.append(pd_b)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat(frames)
    output_name = path_output + 'comic_all' + '.csv'
    df.to_csv(output_name)
    output_name = path_output + 'comic' + '.csv'
    df.to_csv(output_name)
def WriteDifficultsBoxes():
    """Mark as difficult all the tiny objects in the xml files.

    For every image listed in the WikiTenLabels test split, flags every
    bounding box whose area is <= ``size_min`` pixels as ``difficult`` and
    rewrites the XML annotation in place (only when at least one box was
    flagged).
    """
    size_min = 25 * 25  # 20*20 : keep at least objects bigger than 17*17
    path_b = '/media/gonthier/HDD/data/Wikidata_Paintings/WikiTenLabels/Main/test.txt'
    path_to_im = '/media/gonthier/HDD/data/Wikidata_Paintings/WikiTenLabels/JPEGImages/'
    pd_b = pd.read_csv(path_b, sep=r"\s*", names=['item'], dtype=str)
    for index, row in pd_b.iterrows():
        Erase = False
        i = row['item']
        path_i = path_to_im + i + '.jpg'
        im = cv2.imread(path_i)
        if im is None:
            # cv2.imread silently returns None on a missing/corrupt file;
            # fail loudly instead of raising an AttributeError on .shape.
            raise FileNotFoundError('Could not read image: %s' % path_i)
        height = im.shape[0]
        width = im.shape[1]
        writer = Writer(path_i, width, height)
        pathxml = '/media/gonthier/HDD/data/Wikidata_Paintings/WikiTenLabels/Annotations/%s.xml' % (i)
        read_file = voc_eval.parse_rec(pathxml)
        for element in read_file:
            classe_elt = element['name']
            xmin, ymin, xmax, ymax = element['bbox']
            area = (xmax - xmin) * (ymax - ymin)
            if area <= size_min:
                # Tiny object: re-emit the box marked as difficult.
                Erase = True
                writer.addObject(classe_elt, xmin, ymin, xmax, ymax, difficult=1)
            else:
                writer.addObject(classe_elt, xmin, ymin, xmax, ymax)
        if Erase:
            # Only rewrite the annotation file when something changed.
            writer.save(annotation_path=pathxml)
            print('Modified :', i)
    return (0)
def Clipart():
    """Build the per-image multi-label CSV files for the Clipart dataset.

    Labels are read from the VOC-style XML annotations: a class column is set
    to 1 when at least one instance is present, -1 otherwise. Writes
    ``clipart_all.csv`` (raw) and ``clipart.csv`` where the difficult cases
    (0) are promoted to positives (1).
    """
    classes = ["aeroplane", "bicycle", "bird", "boat", "bottle",
               "bus", "car", "cat", "chair", "cow", "diningtable",
               "dog", "horse", "motorbike", "person", "pottedplant",
               "sheep", "sofa", "train", "tvmonitor"]
    sets = [('clipart', 'train'), ('clipart', 'test')]
    frames = []
    for base, image_set in sets:
        path_b = '/media/gonthier/HDD/data/cross-domain-detection/datasets/clipart/ImageSets/Main/%s.txt' % (image_set)
        pd_b = pd.read_csv(path_b, sep=r"\s*", names=['name_img'], dtype=str)
        for c in classes:
            pd_b[c] = -1  # -1 = class absent (default)
        print(pd_b.head(5))
        for index, row in pd_b.iterrows():
            i = row['name_img']
            path_i = '/media/gonthier/HDD/data/cross-domain-detection/datasets/clipart/Annotations/%s.xml' % (i)
            read_file = voc_eval.parse_rec(path_i)
            for element in read_file:
                classe_elt = element['name']
                if classe_elt in classes:
                    pd_b.loc[pd_b['name_img'] == row['name_img'], classe_elt] = 1
        pd_b['set'] = image_set
        frames.append(pd_b)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat(frames)
    output_name = path_output + 'clipart_all' + '.csv'
    print(df.iloc[[45, 46, 47]])
    df.to_csv(output_name)
    output_name = path_output + 'clipart' + '.csv'
    # Replace the 0s by 1s: treat difficult cases as certain positives.
    for c in classes:
        df.loc[df[c] == 0, c] = 1
    print(df.iloc[[45, 46, 47]])
    df.to_csv(output_name)
    df = pd.read_csv(output_name, dtype=str)
    print(df.iloc[[45, 46, 47]])
def Watercolor():
    """Build the per-image multi-label CSV files for the Watercolor dataset.

    Labels are read from the VOC-style XML annotations: a class column is set
    to 1 when at least one instance is present, -1 otherwise. Writes
    ``watercolor_all.csv`` (raw) and ``watercolor.csv`` where the difficult
    cases (0) are promoted to positives (1).
    """
    classes = ["bicycle", "bird", "car", "cat", "dog", "person"]
    sets = [('watercolor', 'train'), ('watercolor', 'test')]
    frames = []
    for base, image_set in sets:
        path_b = '/media/gonthier/HDD/data/cross-domain-detection/datasets/watercolor/ImageSets/Main/%s.txt' % (image_set)
        pd_b = pd.read_csv(path_b, sep=r"\s*", names=['name_img'], dtype=str)
        for c in classes:
            pd_b[c] = -1  # -1 = class absent (default)
        print(pd_b.head(5))
        for index, row in pd_b.iterrows():
            i = row['name_img']
            path_i = '/media/gonthier/HDD/data/cross-domain-detection/datasets/watercolor/Annotations/%s.xml' % (i)
            read_file = voc_eval.parse_rec(path_i)
            for element in read_file:
                classe_elt = element['name']
                if classe_elt in classes:
                    pd_b.loc[pd_b['name_img'] == row['name_img'], classe_elt] = 1
        pd_b['set'] = image_set
        frames.append(pd_b)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat(frames)
    output_name = path_output + 'watercolor_all' + '.csv'
    print(df.iloc[[45, 46, 47]])
    df.to_csv(output_name)
    output_name = path_output + 'watercolor' + '.csv'
    # Replace the 0s by 1s: treat difficult cases as certain positives.
    for c in classes:
        df.loc[df[c] == 0, c] = 1
    print(df.iloc[[45, 46, 47]])
    df.to_csv(output_name)
    df = pd.read_csv(output_name, dtype=str)
    print(df.iloc[[45, 46, 47]])
def StatsOnWikiTenLabels():
    """Print label and instance statistics on the WikiTenLabels dataset.

    First prints, for the train and test splits of the classification CSV,
    the distribution of the number of positive labels per image; then parses
    the detection annotations of the test split and counts the instances per
    class, both in total and restricted to boxes larger than ``size_min``.
    """
    path_data = '/media/gonthier/HDD/output_exp/ClassifPaintings/'
    name_file = path_data + 'WikiTenLabels.csv'
    classes_a_garder = ['angel', 'Child_Jesus', 'crucifixion_of_Jesus', 'Mary',
                        'nudity', 'ruins', 'Saint_Sebastien']
    df = pd.read_csv(name_file, sep=',')
    # .copy() so adding the 'sum' column does not trigger SettingWithCopy.
    df_test = df[df['set'] == 'test'].copy()
    df_train = df[df['set'] == 'train'].copy()
    print("For test")
    df_test['sum'] = df_test[classes_a_garder].sum(axis=1)
    for i in range(len(classes_a_garder)):
        # .to_numpy() replaces DataFrame.as_matrix, removed in pandas 1.0.
        print(i, len(np.where(df_test['sum'].to_numpy(dtype=int) == i)[0]))
    print("For train")
    df_train['sum'] = df_train[classes_a_garder].sum(axis=1)
    for i in range(len(classes_a_garder)):
        print(i, len(np.where(df_train['sum'].to_numpy(dtype=int) == i)[0]))
    print('Statistiques sur le test set')
    print(df_test.sum())
    print()
    print('Statistiques sur le train set')
    print(df_train.sum())
    size_min = 25 * 25  # 15*15
    path_b = '/media/gonthier/HDD/data/Wikidata_Paintings/WikiTenLabels/ImageSets/Main/test.txt'
    pd_b = pd.read_csv(path_b, sep=r"\s*", names=['item'], dtype=str)
    dict_elts_total = {}
    dict_elts_sizemin = {}
    for c in classes_a_garder:
        pd_b[c] = 0
        dict_elts_total[c] = 0
        dict_elts_sizemin[c] = 0
    without_class = 0
    numberofIm = 0
    for index, row in pd_b.iterrows():
        numberofIm += 1
        i = row['item']
        path_i = '/media/gonthier/HDD/data/Wikidata_Paintings/WikiTenLabels/Annotations/%s.xml' % (i)
        read_file = voc_eval.parse_rec(path_i)
        with_class = False
        for element in read_file:
            classe_elt = element['name']
            xmin, ymin, xmax, ymax = element['bbox']
            area = (xmax - xmin) * (ymax - ymin)
            if classe_elt in classes_a_garder:
                # We got an element from one of the tracked classes.
                with_class = True
                pd_b.loc[pd_b['item'] == row['item'], classe_elt] = 1
                dict_elts_total[classe_elt] += 1
                if area > size_min:
                    dict_elts_sizemin[classe_elt] += 1
        if not with_class:
            without_class += 1
    print('Statistiques au niveaux du nombre de classes avec les labels dans la partie annotee en detection')
    print(pd_b.sum())
    print('Nombre d instances des differentes classes')
    num_obj = 0
    for c in classes_a_garder:
        print(c, ' : ', dict_elts_total[c])
        num_obj += dict_elts_total[c]
    print('Nombre d images totales', numberofIm)
    print('Nombre d instances totales', num_obj)
    print("Nombre d images sans classes", without_class)
    # Bug fix: the original also printed num_obj here, but at this point it
    # still held the *total* count, not the size-filtered one.
    print('Nombre d instances des differentes classes avec une taille superieur a :', size_min, 'pixels')
    num_obj = 0
    for c in classes_a_garder:
        print(c, ' : ', dict_elts_sizemin[c])
        num_obj += dict_elts_sizemin[c]
    print('Nombre d instances de taille superieur a ', size_min, 'pixels', num_obj)
def Stats_and_testFile():
    """Print statistics on the WikiTenLabels test set.

    Optionally (``write_test_file``) regenerates the test.txt image-set file
    from the content of the Annotations folder, then counts the per-class
    instances in the detection annotations, in total and restricted to boxes
    larger than ``size_min``.
    """
    annotations_folder = '/media/gonthier/HDD/data/Wikidata_Paintings/WikiTenLabels/Annotations/'
    path_data = '/media/gonthier/HDD/output_exp/ClassifPaintings/'
    name_file = path_data + 'WikiTenLabels.csv'
    df = pd.read_csv(name_file, sep=',')
    df_test = df[df['set'] == 'test']
    print('Statistiques sur le test set')
    print(df_test.sum())
    classes = ['angel', 'beard', 'capital', 'Child_Jesus', 'crucifixion_of_Jesus',
               'Mary', 'nudity', 'ruins', 'Saint_Sebastien', 'turban']
    list_elt = os.listdir(annotations_folder)
    write_test_file = False
    if write_test_file:
        # Bug fix: the original path was '.../Main/ImageSets/test.txt' whereas
        # every other access in this file uses '.../ImageSets/Main/test.txt'.
        file_test = '/media/gonthier/HDD/data/Wikidata_Paintings/WikiTenLabels/ImageSets/Main/test.txt'
        # with-statement guarantees the file is closed even on error.
        with open(file_test, "w") as f_out:
            for elt in list_elt:
                elt_wt_jpg = elt.split('.')[0]
                f_out.write(elt_wt_jpg + ' \n')
    size_min = 25 * 25  # 15*15
    path_b = '/media/gonthier/HDD/data/Wikidata_Paintings/WikiTenLabels/ImageSets/Main/test.txt'
    pd_b = pd.read_csv(path_b, sep=r"\s*", names=['item'], dtype=str)
    dict_elts_total = {}
    dict_elts_sizemin = {}
    for c in classes:
        pd_b[c] = 0
        dict_elts_total[c] = 0
        dict_elts_sizemin[c] = 0
    for index, row in pd_b.iterrows():
        i = row['item']
        path_i = '/media/gonthier/HDD/data/Wikidata_Paintings/WikiTenLabels/Annotations/%s.xml' % (i)
        read_file = voc_eval.parse_rec(path_i)
        for element in read_file:
            classe_elt = element['name']
            xmin, ymin, xmax, ymax = element['bbox']
            area = (xmax - xmin) * (ymax - ymin)
            if classe_elt in classes:
                # We got an element from one of the tracked classes.
                pd_b.loc[pd_b['item'] == row['item'], classe_elt] = 1
                dict_elts_total[classe_elt] += 1
                if area > size_min:
                    dict_elts_sizemin[classe_elt] += 1
    print('Statistiques au niveaux du nombre de classes avec les labels dans la partie annotee en detection')
    print(pd_b.sum())
    print('Nombre d instances des differentes classes')
    num_obj = 0
    for c in classes:
        print(c, ' : ', dict_elts_total[c])
        num_obj += dict_elts_total[c]
    print('Nombre d instances totales', num_obj)
    # Bug fix: the original also printed num_obj here, but at this point it
    # still held the *total* count, not the size-filtered one.
    print('Nombre d instances des differentes classes avec une taille superieur a :', size_min, 'pixels')
    num_obj = 0
    for c in classes:
        print(c, ' : ', dict_elts_sizemin[c])
        num_obj += dict_elts_sizemin[c]
    print('Nombre d instances de taille superieur a ', size_min, 'pixels', num_obj)
def CASPApaintings(copyFile=False, convertXML=False, copyIm=False):
    """Creation of the CASPA paintings dataset for WSOD training and evaluation.

    Args:
        copyFile: if True, build the per-class and global train/test split files.
        convertXML: if True, convert the LabelMe-style XML annotations to
            Pascal VOC format.
        copyIm: if True, copy the per-class images into the flat JPEGImages
            folder (lower-cased names).
    """
    classes = ["bear", "bird", "cat", "cow", "dog", "elephant", "horse", "sheep"]
    old_names = ["Bear", "bird", "cat", "Cattle Cow Bull", "dog", "elephant",
                 "horse", "sheep shepherd"]
    old_img_folder = 'Images_Per_Class'
    default_path_imdb = '/media/gonthier/HDD2/data/'
    if convertXML:
        for old_name in old_names:
            ff = old_name.lower()
            ff = ff.replace(' ', '_')
            for folder in ['non', 'realistic']:
                f = ff + '_' + folder
                old_path = default_path_imdb + 'CASPApaintings/Annotations/' + f
                list_xml = glob.glob(old_path + '/*.xml')
                for elt in list_xml:
                    dst = default_path_imdb + 'CASPApaintings/Annotations/' + elt.split('/')[-1]
                    path_img = default_path_imdb + 'CASPApaintings/JPEGImages/' + elt.split('/')[-1].replace('xml', 'jpg')
                    # Read the old (LabelMe-style) xml file.
                    old_tree = ET.parse(elt)
                    root = old_tree.getroot()
                    size = root.find('imagesize')
                    width = int(size.find('ncols').text)
                    height = int(size.find('nrows').text)
                    list_bd = []
                    for obj in root.iter('object'):
                        cls = obj.find('name').text
                        if cls not in classes:
                            continue
                        polygon = obj.find('polygon')
                        points = polygon.iter('pt')
                        # Assumes the polygon is an axis-aligned rectangle:
                        # pt0 -> (xmin, ymin), pt1 -> xmax, pt2 -> ymax.
                        # TODO confirm against the annotation format.
                        xmin = ymin = xmax = ymax = None
                        for i, pt in enumerate(points):
                            x = int(pt.find('x').text)
                            y = int(pt.find('y').text)
                            if i == 0:
                                xmin = x
                                ymin = y
                            elif i == 1:
                                xmax = x
                            elif i == 2:
                                ymax = y
                        list_bd += [[cls, xmin, ymin, xmax, ymax]]
                    WriteNewXMLfile(dst, path_img, width, height, list_bd)
    if copyIm:
        # Copy of the images in the new folders.
        for f_name in old_names:
            for folder in ['non', 'realistic']:
                old_path = default_path_imdb + 'CASPApaintings/' + old_img_folder + '/' + f_name + '/' + folder
                list_im = glob.glob(old_path + '/*.jpg')
                for elt in list_im:
                    lower_name_img = elt.split('/')[-1].lower()
                    print(lower_name_img)
                    dst = default_path_imdb + 'CASPApaintings/JPEGImages/' + lower_name_img
                    copyfile(elt, dst)
    if copyFile:
        # Creation of the train and test file per class.
        df_test = None
        df_train = None
        for new_name, old_name in zip(classes, old_names):
            path_c = default_path_imdb + 'CASPApaintings/' + old_img_folder + '/' + old_name + '.txt'
            pd_c = pd.read_csv(path_c, names=['name_img'], dtype=str, encoding='utf-8')
            pd_c_train, pd_c_test = train_test_split(pd_c, test_size=0.5, random_state=0)
            # regex=False: strip the literal '.jpg' suffix; without it '.' is
            # a regex wildcard and e.g. 'xjpg' would also be removed.
            pd_c_train['name_img'] = pd_c_train['name_img'].str.replace('.jpg', '', regex=False)
            pd_c_train['name_img'] = pd_c_train['name_img'].str.lower()
            pd_c_test['name_img'] = pd_c_test['name_img'].str.replace('.jpg', '', regex=False)
            pd_c_test['name_img'] = pd_c_test['name_img'].str.lower()
            path_c_train = default_path_imdb + 'CASPApaintings/ImageSets/Main/' + new_name + '_train.txt'
            path_c_test = default_path_imdb + 'CASPApaintings/ImageSets/Main/' + new_name + '_test.txt'
            pd_c_train.to_csv(path_c_train, header=False, index=False)
            pd_c_test.to_csv(path_c_test, header=False, index=False)
            # pd.concat replaces DataFrame.append (removed in pandas 2.0).
            df_train = pd_c_train if df_train is None else pd.concat([df_train, pd_c_train])
            df_test = pd_c_test if df_test is None else pd.concat([df_test, pd_c_test])
        df_test = df_test.drop_duplicates(subset='name_img')
        df_train = df_train.drop_duplicates(subset='name_img')
        df_train = df_train['name_img']
        df_test = df_test['name_img']
        # Drop the images listed in the split files but missing on disk.
        num_test_drop = 0
        for row in df_test.values:
            name_im = default_path_imdb + 'CASPApaintings/JPEGImages/' + row + '.jpg'
            if not os.path.isfile(name_im):
                df_test = df_test[df_test != row]
                num_test_drop += 1
        num_train_drop = 0
        for row in df_train.values:
            name_im = default_path_imdb + 'CASPApaintings/JPEGImages/' + row + '.jpg'
            if not os.path.isfile(name_im):
                print(row, name_im)
                df_train = df_train[df_train != row]
                num_train_drop += 1
        print('number of drop images for train :', num_train_drop, 'for test', num_test_drop)
        path_train = default_path_imdb + 'CASPApaintings/ImageSets/Main/train.txt'
        path_test = default_path_imdb + 'CASPApaintings/ImageSets/Main/test.txt'
        df_train.to_csv(path_train, header=False, index=False)
        df_test.to_csv(path_test, header=False, index=False)
    # Build the global multi-label CSV from the Pascal VOC xml files.
    # NOTE(review): the indentation of this section is ambiguous in the
    # collapsed source; it is placed at function level here - confirm.
    oneNotFound = False
    list_mising_xml = []
    frames = []
    sets = [('CASPApaintings', 'train'), ('CASPApaintings', 'test')]
    for base, image_set in sets:
        path_b = '/media/gonthier/HDD2/data/CASPApaintings/ImageSets/Main/%s.txt' % (image_set)
        pd_b = pd.read_csv(path_b, names=['name_img'], dtype=str, encoding='utf-8')
        for c in classes:
            pd_b[c] = -1  # -1 = class absent (default)
        for index, row in pd_b.iterrows():
            i = row['name_img']
            path_i = '/media/gonthier/HDD2/data/CASPApaintings/Annotations/%s.xml' % (i)
            try:
                read_file = voc_eval.parse_rec(path_i)
                for element in read_file:
                    classe_elt = element['name']
                    if classe_elt in classes:
                        pd_b.loc[pd_b['name_img'] == row['name_img'], classe_elt] = 1
            except FileNotFoundError:
                print(path_i, 'not found')
                oneNotFound = True
                list_mising_xml += [i]
        pd_b['set'] = image_set
        frames.append(pd_b)
    # pd.concat replaces DataFrame.append (removed in pandas 2.0).
    df = pd.concat(frames)
    df = df.drop_duplicates(subset='name_img')
    print('Size dataset :', len(df))
    print(list_mising_xml)
    # Hand-made labels for the images whose xml annotation is missing,
    # aligned with list_mising_xml.
    c_missing = [['bear', 'horse'], ['bear'], ['bear'], ['elephant'], ['horse'],
                 ['horse'], ['horse'], ['horse'], ['bear'], ['bear', 'sheep'],
                 ['bear', 'dog'], ['bear'], ['bear'], ['bear'], ['horse']]
    print('Modification of missing elements')
    # Bug fix: the original loop variable was named `classes`, clobbering the
    # class list, so the per-class set files written below only covered the
    # classes of the last c_missing entry.
    for missing_classes, name in zip(c_missing, list_mising_xml):
        print(name, df[df['name_img'] == name]['set'])
        for c in missing_classes:
            df.loc[df['name_img'] == name, c] = 1
        df.loc[df['name_img'] == name, 'set'] = 'train'
    print('Size of datasets : ', len(df), 'size train set :', len(df[df['set'] == 'train']), 'size test set :', len(df[df['set'] == 'test']))
    # The missing annotations were all fixed by hand above, so the CSVs can
    # be saved under their final names.
    oneNotFound = False
    df.loc[df['name_img'] == '4-cat-and-cattle', 'cat'] = 1
    if not oneNotFound:
        output_name = path_output + 'CASPApaintings_all' + '.csv'
        df.to_csv(output_name, index=False)
        output_name = path_output + 'CASPApaintings' + '.csv'
        df.to_csv(output_name, index=False)
        output_name = default_path_imdb + 'CASPApaintings/ImageSets/Main/CASPApaintings.csv'
        df.to_csv(output_name, index=False)
    else:
        output_name = path_output + 'CASPApaintings_tmp' + '.csv'
        df.to_csv(output_name, index=False)
        print('XML not found, not saved')
    # Extract the per-split and per-class image list files from the CSV.
    output_name_csv = path_output + 'CASPApaintings' + '.csv'
    print(output_name_csv)
    df_new = pd.read_csv(output_name_csv, dtype=str)
    for split in ['train', 'test']:  # renamed from `set` (shadowed the builtin)
        df_set = df_new[df_new['set'] == split]
        print(split, len(df_set))
        all_names = df_set['name_img'].values
        output_name = default_path_imdb + 'CASPApaintings/ImageSets/Main/' + split + '.txt'
        np.savetxt(output_name, all_names, delimiter='\n', fmt='%s')
        for c in classes:
            # The CSV was re-read with dtype=str, hence the str(1) comparison.
            df_c = df_set[df_set[c] == str(1)]
            all_names_c = df_c['name_img'].values
            output_name = default_path_imdb + 'CASPApaintings/ImageSets/Main/' + c + '_' + split + '.txt'
            np.savetxt(output_name, all_names_c, delimiter='\n', fmt='%s')