header=True,
    inferSchema=True,
    nullValue='NA')

# Report how many rows were loaded, before any cleaning
n_loaded = flights.count()
print("The data contain %d records." % n_loaded, '\n')

# Keep only the rows that have a 'delay' value
flights = flights.where('delay IS NOT NULL')

# Set up the indexer for the categorical carrier column
indexer = StringIndexer(inputCol="carrier", outputCol='carrier_idx')

# Fitting lets the indexer identify the categories present in the data
indexer_model = indexer.fit(flights)

# Transforming appends the numeric index column
flights_indexed = indexer_model.transform(flights)

# Same fit/transform steps for the org categorical column
org_indexer = StringIndexer(inputCol="org", outputCol='org_idx')
org_indexer_model = org_indexer.fit(flights_indexed)
flights_indexed = org_indexer_model.transform(flights_indexed)

# Preview the first five rows
flights_indexed.show(n=5)

# Report the row count after cleaning and indexing
n_indexed = flights_indexed.count()
print("The data contain %d records." % n_indexed, '\n')

spark.stop()
Exemple #2
0
# In[110]:


# Summary statistics for the DataFrame
desidxer_df.describe().show()


# In[115]:


# Show the rows whose tailnum is the literal string 'NA' or is empty.
# The earlier form of this cell was not valid Python (`||`) and passed a
# predicate to select(), which projects columns; row conditions belong in
# filter(), written here as a Spark SQL expression string.
desidxer_df.filter("tailnum = 'NA' OR tailnum = ''").show()


# In[118]:


# Total number of rows
desidxer_df.count()


# In[120]:


# Remove rows that contain null values.
# drop() with no column names removes nothing, so df3.count() would always
# equal desidxer_df.count(); dropna() is what actually discards rows.
# NOTE(review): assumes the intent was null-row removal — confirm. Literal
# 'NA' strings are only dropped if they were read in as nulls.
df3 = desidxer_df.dropna()


# In[123]:


# Number of rows left after removing rows with nulls
df3.count()


# In[ ]: