Esempio n. 1
0
def encode_label():
    """Label-encode the raw uids and persist uid/pid tables for both splits.

    The raw train and test sets come from ``access_data.load_raw_data()``.
    A single ``LabelEncoder`` is fitted on the uids of both splits together
    (``train[0]`` and ``test[0]``), so the two splits share one uid mapping.
    Each encoded uid is paired with the matching value from ``train[1]`` /
    ``test[1]``, stored under the ``pid`` column.

    Side effects:
        Pickles the train table to ``setting.processed_data_dir`` +
        ``'uid&pid_train'`` and the test table to
        ``setting.processed_data_dir`` + ``'uid&pid_test'``.

    Returns:
        A ``(train_table, test_table)`` tuple of DataFrames, each with the
        columns ``uid`` and ``pid``.
    """
    # NOTE(review): presumably train/test are DataFrames with integer column
    # labels -- confirm against access_data.load_raw_data().
    train, test = access_data.load_raw_data()
    raw_train_uids = train[0].values
    raw_test_uids = test[0].values

    # Fit on the union of both splits so every uid seen in either one can be
    # transformed below.
    encoder = LabelEncoder()
    encoder.fit(list(raw_train_uids) + list(raw_test_uids))

    encoded_train_uids = encoder.transform(raw_train_uids)
    encoded_test_uids = encoder.transform(raw_test_uids)

    train_table = pd.DataFrame({
        'uid': encoded_train_uids,
        'pid': train[1].values,
    })
    test_table = pd.DataFrame({
        'uid': encoded_test_uids,
        'pid': test[1].values,
    })

    train_path = setting.processed_data_dir + 'uid&pid_train'
    test_path = setting.processed_data_dir + 'uid&pid_test'

    train_table.to_pickle(train_path)
    test_table.to_pickle(test_path)
    return train_table, test_table
Esempio n. 2
0
def clean_corpus():
    """Strip marker symbols from the raw corpus and count them as features.

    The train corpus is taken from ``train[6]`` and the test corpus from
    ``test[3]`` of ``access_data.load_raw_data()``. For each regex in the
    pattern list, in order, every match in the ``corpus`` column is replaced
    by a single space, and the number of replacements per row is stored in a
    new column named after the pattern string itself.

    Returns:
        A ``(train, test)`` tuple of DataFrames holding the cleaned
        ``corpus`` column plus one count column per pattern.
    """
    raw_train, raw_test = access_data.load_raw_data()
    train = pd.DataFrame({'corpus': raw_train[6]})
    test = pd.DataFrame({'corpus': raw_test[3]})

    # Order matters: the URL pattern and '//@' run before the bare '@' and
    # '#' patterns, which would otherwise consume parts of them.
    keyword = [
        r'http[0-9a-zA-Z?:=._@%/\-#&\+|]+',
        r'//@',
        r'@',
        r'#',
        r'【',
        r'《',
        r'\[',
    ]

    for string in keyword:
        reg = re.compile(string)
        for frame in (train, test):
            # subn yields (cleaned_text, replacement_count) for every row.
            outcome = frame['corpus'].map(lambda text: reg.subn(' ', text))
            frame['corpus'] = outcome.map(lambda pair: pair[0])
            frame[string] = outcome.map(lambda pair: pair[1])

        logging.info('finished cleaning symbol %s' % string)

    return train, test
def encode_label():
    """Map raw uids to integer labels and save the uid/pid pairs.

    Reads the raw splits through ``access_data.load_raw_data()``, fits one
    ``LabelEncoder`` over the concatenated uids of ``train[0]`` and
    ``test[0]``, and builds one DataFrame per split with the encoded ``uid``
    next to the ``pid`` values taken from ``train[1]`` / ``test[1]``.

    Both DataFrames are pickled under ``setting.processed_data_dir`` as
    ``'uid&pid_train'`` and ``'uid&pid_test'``.

    Returns:
        ``(train_pairs, test_pairs)``: the two DataFrames, each with columns
        ``uid`` and ``pid``.
    """
    train, test = access_data.load_raw_data()
    train_uids, test_uids = train[0].values, test[0].values

    # One encoder for both splits keeps the uid -> label mapping shared.
    # NOTE(review): LabelEncoder is presumably scikit-learn's -- confirm import.
    uid_encoder = LabelEncoder()
    uid_encoder.fit(list(train_uids) + list(test_uids))

    train_labels = uid_encoder.transform(train_uids)
    test_labels = uid_encoder.transform(test_uids)

    train_pairs = pd.DataFrame({'uid': train_labels, 'pid': train[1].values})
    test_pairs = pd.DataFrame({'uid': test_labels, 'pid': test[1].values})

    out_dir = setting.processed_data_dir
    train_target, test_target = out_dir + 'uid&pid_train', out_dir + 'uid&pid_test'

    train_pairs.to_pickle(train_target)
    test_pairs.to_pickle(test_target)
    return train_pairs, test_pairs
def clean_corpus():
    """Clean the raw corpus text and extract per-symbol count features.

    Builds a one-column ``corpus`` DataFrame for each split (``train[6]`` and
    ``test[3]`` from ``access_data.load_raw_data()``), then applies each
    regex in turn: matches are replaced with a single space in ``corpus`` and
    the per-row match count is written to a column keyed by the pattern
    string.

    Returns:
        ``(train, test)``: DataFrames with the cleaned ``corpus`` column and
        one integer count column per pattern.
    """

    def _strip_and_count(frame, pattern, compiled):
        # Replace every match with a space and record how many were replaced.
        substituted = frame['corpus'].map(lambda text: compiled.subn(' ', text))
        frame['corpus'] = substituted.map(lambda result: result[0])
        frame[pattern] = substituted.map(lambda result: result[1])

    source_train, source_test = access_data.load_raw_data()
    train = pd.DataFrame({'corpus': source_train[6]})
    test = pd.DataFrame({'corpus': source_test[3]})

    # Applied strictly in this order; the longer patterns come first.
    keyword = [
        r'http[0-9a-zA-Z?:=._@%/\-#&\+|]+',
        r'//@',
        r'@',
        r'#',
        r'【',
        r'《',
        r'\[',
    ]

    for string in keyword:
        reg = re.compile(string)
        _strip_and_count(train, string, reg)
        _strip_and_count(test, string, reg)

        logging.info('finished cleaning symbol %s' % string)

    return train, test