コード例 #1
0
def construct_features(df, target='last', value=None):
    """
    Construct features according to the rules of Gamberger and Lavrac (2002).

    Each feature is an indicator function that establishes a constraint on the
    value of one attribute. Features are constructed in the following way:

    * For discrete attributes, features are A_i=v_ip and A_i!=v_in. The values
      v_ip and v_in come from the domain of attribute A_i considering only
      positive and negative samples respectively.

    * For continuous attributes, features are:
        1 - A_i <= (v_ip+v_in)/2, for each pair of consecutive values v_ip and
            v_in (in this order), where v_ip comes from the domain of A_i in
            positive samples and v_in from the domain of A_i in negative
            samples.

        2 - A_i > (v_in+v_ip)/2, for each pair of consecutive values v_in and
            v_ip (in this order), with v_in and v_ip as described above.

    * For integer attributes, features are generated considering the rules for
      both discrete and continuous attributes.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset holding the attributes and the target column.
    target : {'last', 'first'}
        Position of the target column among ``df.columns``.
    value : optional
        Target value that marks a sample as positive. When None, the first
        value returned by ``df[target].unique()`` is used.

    Returns
    -------
    set
        Set of ``Feature(column, value, op)`` objects, with ``op`` being one
        of 'eq', 'ne', 'le' or 'gt'.

    Raises
    ------
    KeyError
        If ``target`` is neither 'last' nor 'first'.
    """
    columns = df.columns.values.tolist()
    target_by_position = {'last': columns[-1], 'first': columns[0]}
    target = target_by_position[target]
    if value is None:
        value = df[target].unique()[0]

    is_positive = df[target] == value

    attributes = df.drop(target, axis=1)
    numeric_cols = attributes.select_dtypes(
        include=['floating', 'integer']).columns.values
    categorical_cols = attributes.select_dtypes(
        include=['object', 'category', 'integer']).columns.values

    features = set()

    for col in categorical_cols:
        features.update(Feature(col, val, 'eq')
                        for val in df.loc[is_positive, col].unique())
        features.update(Feature(col, val, 'ne')
                        for val in df.loc[~is_positive, col].unique())

    for col in numeric_cols:
        ordered = df.sort_values(by=col)
        col_values = ordered[col].values
        # Work on plain arrays: after sorting, the Series index no longer
        # matches row positions, so label-based indexing would pick the
        # wrong rows.
        sorted_positive = (ordered[target] == value).values

        # Positions i where the class changes between rows i and i+1.
        # Comparing the two shifted slices (instead of np.roll) avoids the
        # wrap-around comparison of the last row against the first one and
        # naturally yields an empty result for 0/1 rows or a single class.
        boundaries = np.flatnonzero(sorted_positive[:-1] != sorted_positive[1:])
        midpoints = (col_values[boundaries] + col_values[boundaries + 1]) / 2
        # Positive sample on the lower side -> "A <= midpoint", else "A > midpoint".
        ops = np.where(sorted_positive[boundaries], 'le', 'gt')

        features.update(Feature(col, midpoint, op)
                        for midpoint, op in zip(midpoints, ops))

    return features
コード例 #2
0
    def run(self):
        """Generate highlights or a movie summary and report the result.

        When ``self.isMovie`` is not ``'movie'``, the video is optionally
        downloaded first (if ``self.do_download`` is set) and
        ``highlight_generation`` is run on ``self.filePath`` /
        ``self.fileName``. Otherwise a ``Summarizer`` with a duration of 120
        is run over a ``Feature`` built from the concatenated path and name.

        The outcome is sent through ``self.update.emit`` as
        ``(status, False, None)`` on success, or as a fixed error tuple with
        ``True`` and the exception text when anything in the processing
        raises.
        """
        # Decided before entering the try block so that a failure while
        # reading the flag is not reported as a processing error.
        wants_highlights = self.isMovie != 'movie'

        try:
            if wants_highlights:
                # Fetch the video first when a download was requested.
                if self.do_download:
                    downloader = youtube(self.url)
                    self.filePath, self.fileName = downloader.download()
                generator = highlight_generation(self.filePath, self.fileName)
                outcome = generator.generate()
            else:
                extractor = Feature(self.filePath + self.fileName)
                summarizer = Summarizer(duration=120)
                summarizer.set_feature_extractor(extractor)
                outcome = summarizer.summarize()
            self.update.emit(outcome, False, None)
        except Exception as err:
            self.update.emit(('_', 'Error occured, Please try again!'),
                             True, str(err))
コード例 #3
0
    def tokenize(self, tokenizer, max_seq_len):
        """Tokenize ``self.text`` and build the matching feature.

        The text is split with ``tokenizer.tokenize`` and the resulting
        tokens are handed to ``Feature.make_single`` together with
        ``self.example_id``, the tokenizer and ``max_seq_len``.

        Returns a ``(feature, label)`` tuple, where ``label`` is
        ``self.label``.
        """
        # Word segmentation.
        pieces = tokenizer.tokenize(self.text)

        # Convert to a feature: (idx, input_ids, input_mask, segment_ids).
        built = Feature.make_single(
            self.example_id, pieces, tokenizer, max_seq_len)

        return built, self.label
コード例 #4
0
ファイル: example.py プロジェクト: gmanolia/csqa
    def f(self, tokenizer, max_seq_length):
        """Build one feature for each of the five texts of this example.

        ``self.text1`` .. ``self.text5`` are all tokenized first; each token
        list is then passed to ``Feature.make_single`` with ``self.idx``,
        the tokenizer and ``max_seq_length``.

        Returns a 5-tuple of features in the order text1 .. text5.
        """
        # Phase 1: tokenize every text before any feature is built.
        token_lists = [
            tokenizer.tokenize(text)
            for text in (self.text1, self.text2, self.text3,
                         self.text4, self.text5)
        ]

        # Phase 2: one single-sequence feature per token list.
        return tuple(
            Feature.make_single(self.idx, tokens, tokenizer, max_seq_length)
            for tokens in token_lists
        )