def construct_features(df, target='last', value=None):
    '''
    Construct features according to the rules specified in Gamberger and
    Lavrac (2002).

    Each feature is an indicator function that establishes constraints on the
    values of individuals in the dataset. Features are constructed in the
    following way:

    * For discrete attributes, features are A_i=v_ip and A_i!=v_in. The values
      v_ip and v_in come from the domain of attribute A_i considering only
      positive and negative samples respectively.
    * For continuous attributes, features are:
        1 - A_i <= (v_ip+v_in)/2, for each pair of consecutive values v_ip and
            v_in (in this order), conditioned that v_ip comes from the domain
            of A_i in positive samples and v_in from the domain of A_i in
            negative samples.
        2 - A_i > (v_in+v_ip)/2, for each pair of consecutive values v_in and
            v_ip (in this order), and v_in and v_ip are values from domain of
            A_i as described above.
    * For integer attributes, features are generated considering the rules for
      discrete and continuous attributes.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset holding the attributes and the target column.
    target : str or column label, default 'last'
        'last' or 'first' select the last/first column of ``df`` as the
        target; any other value must be an existing column label.
    value : optional
        Target value that marks a sample as positive. Defaults to the first
        entry of ``df[target].unique()``.

    Returns
    -------
    set
        Set of ``Feature`` objects built as ``Feature(column, value, op)``
        with ``op`` in {'eq', 'ne', 'le', 'gt'}.

    Raises
    ------
    KeyError
        If ``target`` is neither 'last'/'first' nor a column of ``df``.
    '''
    columns = df.columns.values.tolist()
    positional_targets = {'last': columns[-1], 'first': columns[0]}
    if target in positional_targets:
        # The keywords keep priority over a column that happens to share the name.
        target = positional_targets[target]
    elif target not in columns:
        raise KeyError(target)

    if value is None:
        value = df[target].unique()[0]

    pos = df[target] == value
    attributes = df.drop(target, axis=1)
    numeric_cols = attributes.select_dtypes(
        include=['floating', 'integer']).columns.values
    # Integer columns are deliberately listed in both groups (see docstring).
    categorical_cols = attributes.select_dtypes(
        include=['object', 'category', 'integer']).columns.values

    features = set()
    for col in categorical_cols:
        features.update(
            Feature(col, level, 'eq') for level in df.loc[pos, col].unique())
        features.update(
            Feature(col, level, 'ne') for level in df.loc[~pos, col].unique())

    for col in numeric_cols:
        ordered = df.sort_values(by=col)
        # Work on plain arrays: after sorting, the frame's index labels no
        # longer match row positions, so everything below is positional.
        col_values = ordered[col].values
        is_positive = (ordered[target] == value).values
        # Position i is a boundary when sorted rows i and i+1 differ in class.
        # Comparing the two shifted slices avoids any wrap-around between the
        # last and the first row and is simply empty when there is no change.
        boundaries = np.flatnonzero(is_positive[:-1] != is_positive[1:])
        thresholds = (col_values[boundaries] + col_values[boundaries + 1]) / 2
        # positive -> negative gives "<= threshold"; negative -> positive gives
        # "> threshold".
        operators = np.where(is_positive[boundaries], 'le', 'gt')
        features.update(
            Feature(col, threshold, op)
            for threshold, op in zip(thresholds, operators))

    return features
def run(self):
    """Generate a summary and report the outcome through ``self.update``.

    When ``self.isMovie`` is not ``'movie'`` the sports-highlight path runs:
    the video is first downloaded from ``self.url`` if ``self.do_download``
    is truthy, then highlights are generated from ``self.filePath`` /
    ``self.fileName``. Otherwise the file is summarized with a ``Summarizer``
    configured with ``duration=120``.

    On success ``self.update.emit(status, False, None)`` is called. Any
    exception raised along the way is reported as
    ``self.update.emit(('_', <message>), True, str(exc))`` instead of being
    propagated.
    """
    # Decide the mode before entering the guarded section so that only the
    # actual work is covered by the error handler.
    sports_mode = self.isMovie != 'movie'
    try:
        if sports_mode:
            if self.do_download:
                # Fetch the video and remember where it was stored.
                downloader = youtube(self.url)
                self.filePath, self.fileName = downloader.download()
            generator = highlight_generation(self.filePath, self.fileName)
            outcome = generator.generate()
        else:
            extractor = Feature(self.filePath + self.fileName)
            movie_summarizer = Summarizer(duration=120)
            movie_summarizer.set_feature_extractor(extractor)
            outcome = movie_summarizer.summarize()
        self.update.emit(outcome, False, None)
    except Exception as exc:
        self.update.emit(('_', 'Error occured, Please try again!'), True, str(exc))
def tokenize(self, tokenizer, max_seq_len):
    """Tokenize ``self.text`` and build a single-sequence feature from it.

    Args:
        tokenizer: object exposing ``tokenize(text)``; it is also forwarded
            to ``Feature.make_single``.
        max_seq_len: sequence length forwarded to ``Feature.make_single``.

    Returns:
        A ``(feature, label)`` pair, where ``label`` is ``self.label``.
    """
    # Split the text into tokens.
    word_pieces = tokenizer.tokenize(self.text)
    # Convert to a feature: (idx, input_ids, input_mask, segment_ids)
    packed = Feature.make_single(
        self.example_id, word_pieces, tokenizer, max_seq_len
    )
    return packed, self.label
def f(self, tokenizer, max_seq_length):
    """Build one single-sequence feature for each of the five text fields.

    Args:
        tokenizer: object exposing ``tokenize(text)``; it is also forwarded
            to ``Feature.make_single``.
        max_seq_length: sequence length forwarded to ``Feature.make_single``.

    Returns:
        A 5-tuple of features for ``text1`` .. ``text5`` (in that order), all
        created with ``self.idx`` as their identifier.
    """
    texts = (self.text1, self.text2, self.text3, self.text4, self.text5)
    # Tokenize every field first, then build the features, so the calls
    # happen in two separate passes.
    token_lists = [tokenizer.tokenize(text) for text in texts]
    return tuple(
        Feature.make_single(self.idx, tokens, tokenizer, max_seq_length)
        for tokens in token_lists
    )