Esempio n. 1
0

#*******************   Main    ************************************************

#________________________Import Data________________________
# Both inputs are tab-separated files read from the working directory.
train = pd.read_csv('train.tsv', sep='\t')
train.head()  # preview only; the result is discarded when run as a script

# The test file is parsed with the pure-Python engine.
test = pd.read_csv('test.tsv', sep='\t', engine='python')

# Record the number of training rows before stacking the two frames.
trainSize = train.shape[0]

# Stack train on top of test so later preprocessing is applied to both.
# Original row labels are kept, so index values from the two frames can
# repeat in `combined`.
combined = pd.concat([train, test])


#________________________Data Normalization________________________

# Replace missing values column by column. The return value of
# Normalizer.missingValues is ignored, so it is presumably expected to
# modify `combined` in place -- confirm against the Normalizer definition.
for _column, _placeholder in (
    ('brand_name', 'None'),
    ('item_description', 'None'),
    ('category_name', 'missing'),
):
    Normalizer.missingValues(combined, _column, _placeholder)

# Force brand_name, category_name and item_condition_id to the "category" dtype.
for _column in ('brand_name', 'category_name', 'item_condition_id'):
    combined[_column] = combined[_column].astype('category')

# Force shipping and item_description values to strings.
for _column in ('shipping', 'item_description'):
    combined[_column] = combined[_column].astype(str)

# Run every item description through Normalizer.removePunc
# (punctuation removal, going by the helper's name).
combined['item_description'] = combined['item_description'].apply(Normalizer.removePunc)