# NOTE(review): SOURCE arrived with all line breaks collapsed; this is a
# token-identical reformat with comments added.  The chunk appears to be the
# tail of a per-iteration loop (Step 2 of a larger script) followed by
# Steps 3 and 4 of a word2vec pipeline linking pre-operation text to ICD-9
# codes.  `df`, `f`, `features`, `i`, and `pandas` are defined earlier in
# the file, outside this view — TODO confirm against the full script.

# Drop falsy entries (None, '', etc.) from each row's pre_operation list.
df['pre_operation'] = df['pre_operation'].apply(lambda x: list(filter(None, x)))
# Column names of `f` — presumably the feature table loaded earlier; verify.
feature = f.columns.tolist()
# Explode each pre_operation list into one value per (position, row) cell.
s = df.pre_operation.apply(lambda x: pandas.Series(x)).unstack()
# Re-join the exploded values back onto df; the unstacked column is named 0,
# so rename it to 'feature'.
df2 = df.join(pandas.DataFrame((s.reset_index(level=0, drop=True)))).rename(columns={0:'feature'})
# Keep only the (feature, icd9) pairs, discarding rows missing either side.
df2 = df2[['feature','icd9']]
df2 = df2[df2.feature.notnull()]
df2 = df2[df2.icd9.notnull()]
# Prefix ICD-9 codes with '#' — presumably to keep code tokens distinct from
# free-text feature tokens in the word2vec vocabulary; confirm intent.
df2['icd9'] = df2['icd9'].apply(lambda x: '#'+str(x))
# Accumulate this iteration's pairs; `i` is the loop counter (progress log).
features.append(df2)
print(i)
# After the loop: concatenate all accumulated chunks and persist them.
df = pandas.concat(features)
df.to_csv('trainingset.csv')
'''
#Step 3: create a word2vec model capturing association between pre-operation words
'''
# Reload the training pairs and drop rows with a null feature.
df = pandas.read_csv('trainingset.csv')
df = df[['feature','icd9']]
df = df[df.feature.notnull()]
import gensim
# Each [feature, icd9] row is treated as a 2-token "sentence"; min_count=1
# keeps every token in the vocabulary, including singletons.
model = gensim.models.Word2Vec(df.values.tolist(), min_count=1)
# Persist the trained embeddings to disk under the filename 'model'.
model.save('model')
'''
#Step 4: use the word2vec model to find the associated words and link them back to icd-9
'''
import gensim
# Reload the training set, the trained word2vec model, and the feature list
# for the lookup stage (the rest of Step 4 is outside this view).
df_t = pandas.read_csv('trainingset.csv')
model = gensim.models.Word2Vec.load('model')
f = pandas.read_csv('feature.csv')
feature = f.columns.tolist()
# NOTE(review): SOURCE arrived with all line breaks collapsed; this is a
# token-identical reformat with comments added.  The fragment reports the
# top features chosen by a fitted selector (`fsel`, e.g. a tree ensemble —
# confirm), then trains and saves a small Keras binary classifier.
# `nb_features`, `fsel`, `data`, `features`, `X`, `y`, `np`, `Sequential`,
# and `Dense` are all defined earlier in the file, outside this view.

print('%i features identified as important:' % nb_features)
# Column indices of the nb_features most important features, descending.
indices = np.argsort(fsel.feature_importances_)[::-1][:nb_features]
# Print rank, name, and importance for each; the +2 offset skips what are
# presumably two leading non-feature columns in `data` — TODO confirm.
for f in range(nb_features):
    print("%d. feature %s (%f)" % (f + 1, data.columns[2+indices[f]], fsel.feature_importances_[indices[f]]))
# XXX : take care of the feature order
# Collect the selected feature names in column order (sorted indices) rather
# than importance order — hence the XXX note above.
for f in sorted(np.argsort(fsel.feature_importances_)[::-1][:nb_features]):
    features.append(data.columns[2+f])
# Deep learning:
# create model
# Three-layer fully connected binary classifier.
# NOTE(review): input_dim=54 is hard-coded — verify it matches X's column
# count (and nb_features, if X was built from the selected features).
model = Sequential()
model.add(Dense(12, input_dim=54, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit the model
model.fit(X, y, epochs=10, batch_size=10)
# evaluate the model
# NOTE(review): evaluation reuses the training data (X, y), so this reports
# training accuracy, not generalization — consider a held-out split.
scores = model.evaluate(X, y)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
# Save model
# NOTE(review): absolute user-specific Windows path (and 'deep_calssifier'
# spelling) kept byte-identical — it is a runtime string; the directory may
# really be named this way.  Consider a relative/configurable path.
model.save('C:/Users/Rahul/Desktop/antivirus_demo-master/deep_calssifier/deep_classifier.h5')