def _images_to_sframe(frame):
    """Load every image listed in ``frame['path']`` into a row-numbered SFrame.

    :param frame: pandas DataFrame with a 'path' column of image file paths
    :return: SFrame with 'path' and 'image' columns plus the row-number
        column added by ``add_row_number()``
    """
    paths = [row['path'] for _, row in frame.iterrows()]
    images = [tc.Image(path) for path in paths]
    # Build the SFrame in one go instead of appending one row at a time.
    return SFrame({'path': paths, 'image': images}).add_row_number()


def similor_sort(sourceData, classicData, num):
    """
    Score every source image by its mean distance to the classic images.

    An image-similarity model is built over the source images and queried
    with the classic images; for each source image the distances returned
    by the queries are averaged and stored in a new 'distance' column.

    :param sourceData: pandas DataFrame with a 'path' column (reference
        images). It is modified in place: a 'distance' column is inserted
        at position 2.
    :param classicData: pandas DataFrame with a 'path' column (query images)
    :param num: how many neighbours to request per query image; 0 means
        "all reference images"
    :return: ``sourceData`` with the added 'distance' column. Source images
        that were not returned for any query (possible when ``num`` is
        smaller than the number of source images) get NaN.
    """
    start_time = time.time()

    ref_data = _images_to_sframe(sourceData)
    query_data = _images_to_sframe(classicData)

    model = tc.image_similarity.create(ref_data, label=None, feature=None,
                                       model='resnet-50', verbose=True)

    n_ref = ref_data.num_rows()
    n_query = query_data.num_rows()
    if num == 0:
        num = n_ref
    similar_images = model.query(query_data, k=num)

    # One column per reference image (not per requested neighbour):
    # 'reference_label' is used as the column index below, so sizing the
    # matrix by ``num`` overflowed whenever num < n_ref.
    distances = np.full((n_query, n_ref), np.nan)
    for match in similar_images:
        distances[match['query_label'], match['reference_label']] = match['distance']

    # Average only over the distances that were actually returned; pairs
    # outside the top-k stay NaN rather than counting as distance 0.
    seen = ~np.isnan(distances)
    counts = seen.sum(axis=0)
    totals = np.where(seen, distances, 0.0).sum(axis=0)
    mean = np.divide(totals, counts,
                     out=np.full(n_ref, np.nan), where=counts > 0)

    sourceData.insert(2, 'distance', mean)

    elapsed_time = time.time() - start_time
    print("Time elapsed = %d" % (elapsed_time))
    return sourceData
def fields_of_study_papers_ids(self, levels=(1, 2, 3)):
    """
    Build a single SFrame holding the field-of-study PaperIds of every requested level.

    :param levels: iterable of field-of-study levels; each one is passed to
        ``self._create_field_of_study_paper_ids``
    :return: SFrame made by appending the per-level results in order
    """
    combined = SFrame()
    # tqdm only wraps the iteration to show progress over the levels.
    for lvl in tqdm(levels):
        per_level = self._create_field_of_study_paper_ids(lvl)
        combined = combined.append(per_level)
    return combined
def sjr_to_csv(self, regex):
    """
    Load the SJR CSV files matching a pattern into one SFrame with one row per ISSN.

    :param regex: pattern handed to ``self._dataset_dir.glob``
    :return: SFrame of all matching ';'-delimited CSV files appended together,
        with a 'Year' column taken from each file name and the 'Issn' column
        split into 8-digit codes and stacked into an 'ISSN' column
    """
    issn_pattern = re.compile(r'(\d{8})')

    def extract_issns(raw):
        # NOTE(review): only runs of 8 digits are captured; confirm whether
        # ISSNs with a non-digit check character need to be kept.
        return issn_pattern.findall(raw)

    combined = SFrame()
    for csv_path in self._dataset_dir.glob(regex):
        if csv_path.suffix != ".csv":
            continue

        # Greedy '.*' means the last four-digit run starting with 1-3 in the
        # file name is used as the year.
        # NOTE(review): a file name without such a run makes re.match return
        # None and this line raise AttributeError.
        year = int(re.match(r'.*([1-3][0-9]{3})', csv_path.name).group(1))

        yearly = SFrame.read_csv(str(csv_path), delimiter=';')
        yearly['Year'] = year
        # The per-year column name is normalised so the files can be appended.
        yearly = yearly.rename({"Total Docs. (%s)" % year: "Total Docs."})

        # Columns missing from some files are added as empty strings.
        for column in ["Categories"]:
            if column not in yearly.column_names():
                yearly[column] = ''

        combined = combined.append(yearly)

    combined['Issn'] = combined['Issn'].apply(extract_issns)
    return combined.stack('Issn', new_column_name='ISSN')