CLOUDSQL_PWD  = sys.argv[4]

BEST_RANK = int(sys.argv[5])
BEST_ITERATION = int(sys.argv[6])
BEST_REGULATION = float(sys.argv[7])

TABLE_ITEMS  = "Accommodation"
TABLE_RATINGS = "Rating"
TABLE_RECOMMENDATIONS = "Recommendation"

# Read the data from Cloud SQL
# Create DataFrames
#[START read_from_sql]
jdbcDriver = 'com.mysql.jdbc.Driver'
jdbcUrl    = 'jdbc:mysql://%s:3306/%s?user=%s&password=%s' % (CLOUDSQL_INSTANCE_IP, CLOUDSQL_NAME, CLOUDSQL_USER, CLOUDSQL_PWD)
dfAccos = sqlContext.load(source='jdbc', driver=jdbcDriver, url=jdbcUrl, dbtable=TABLE_ITEMS)
dfRates = sqlContext.load(source='jdbc', driver=jdbcDriver, url=jdbcUrl, dbtable=TABLE_RATINGS)
#[END read_from_sql]

# Get all the ratings rows of our user
dfUserRatings  = dfRates.filter(dfRates.userId == USER_ID).map(lambda r: r.accoId).collect()
print(dfUserRatings)

# Returns only the accommodations that have not been rated by our user
rddPotential  = dfAccos.rdd.filter(lambda x: x[0] not in dfUserRatings)
pairsPotential = rddPotential.map(lambda x: (USER_ID, x[0]))

#[START split_sets]
rddTraining, rddValidating, rddTesting = dfRates.rdd.randomSplit([6,2,2])
#[END split_sets]
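For orientation, the BEST_RANK, BEST_ITERATION and BEST_REGULATION values read from the command line above are normally passed straight to ALS on the training split, with the validation split used to check the error. A minimal sketch, assuming pyspark.mllib's ALS and the how_far() helper shown in the fragments further down:

from pyspark.mllib.recommendation import ALS

# Train with the pre-tuned hyperparameters (rank, iterations, lambda)
model = ALS.train(rddTraining, BEST_RANK, BEST_ITERATION, BEST_REGULATION)

# Sanity-check the error on the validation split
# error = how_far(model, rddValidating, rddValidating.count())
# print("Validation error: %f" % error)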
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType

conf = SparkConf().setAppName("app_collaborative")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

jdbcDriver = 'com.mysql.jdbc.Driver'
jdbcUrl = 'jdbc:mysql://173.194.227.120:3306/recoom?user=root'

USER_ID = 0

# Read the data from Cloud SQL
# Create DataFrames
dfAccos = sqlContext.load(source='jdbc',
                          driver=jdbcDriver,
                          url=jdbcUrl,
                          dbtable='AccommodationT')
dfRates = sqlContext.load(source='jdbc',
                          driver=jdbcDriver,
                          url=jdbcUrl,
                          dbtable='RatingT')

# Get all the ratings rows of our user
dfUserRatings = dfRates.filter(
    dfRates.userId == USER_ID).map(lambda r: r.accoId).collect()
print(dfUserRatings)

# Returns only the accommodations that have not been rated by our user
rddPotential = dfAccos.rdd.filter(lambda x: x[0] not in dfUserRatings)
pairsPotential = rddPotential.map(lambda x: (USER_ID, x[0]))
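This fragment stops at pairsPotential; a natural follow-up is to train a model on the ratings and score those unrated accommodations. A minimal sketch, assuming the Rating table has exactly (userId, accoId, rating) columns and using placeholder hyperparameters that mirror the ALS.train call further down:

from pyspark.mllib.recommendation import ALS

# Train on all known ratings (placeholder rank/iterations/lambda)
model = ALS.train(dfRates.rdd, 20, 20, 0.1)

# Predict a rating for every (USER_ID, accoId) pair the user has not rated
predictions = model.predictAll(pairsPotential)

# Keep the 5 accommodations with the highest predicted rating
print(predictions.takeOrdered(5, key=lambda p: -p[2]))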
Example #3
    # Returns the pairs (prediction, rating)
    predictionsAndRatings = predictions.join(againstWiRatings).values()

    # Returns the root mean squared error (RMSE) of the predictions
    return sqrt(
        predictionsAndRatings.map(lambda s: (s[0] - s[1])**2).reduce(add) /
        float(sizeAgainst))


#[END how_far]

# Read the data from Cloud SQL
# Create DataFrames
dfRates = sqlContext.load(source='jdbc',
                          driver=jdbcDriver,
                          url=jdbcUrl,
                          dbtable='Rating')

rddUserRatings = dfRates.filter(dfRates.userId == 0).rdd
print(rddUserRatings.count())

# Split the data into 3 different sets: training, validating, testing
# 60% 20% 20%
rddRates = dfRates.rdd
rddTraining, rddValidating, rddTesting = rddRates.randomSplit([6, 2, 2])

# Add the user's ratings to the training set (union returns a new RDD)
rddTraining = rddTraining.union(rddUserRatings)
nbValidating = rddValidating.count()
nbTesting = rddTesting.count()

from math import sqrt
from operator import add

#[START how_far]
# Measures how far the model's predictions are from the known ratings.
def how_far(model, against, sizeAgainst):
  # Drop the ratings to keep only the (user, product) pairs to predict
  againstNoRatings = against.map(lambda x: (int(x[0]), int(x[1])))

  # Keep the known rating, keyed by (user, product), for later comparison
  againstWiRatings = against.map(lambda x: ((int(x[0]),int(x[1])), int(x[2])) )

  # Make a prediction and map it for later comparison
  # The map has to be ((user,product), rating) not ((product,user), rating)
  predictions = model.predictAll(againstNoRatings).map(lambda p: ( (p[0],p[1]), p[2]) )

  # Returns the pairs (prediction, rating)
  predictionsAndRatings = predictions.join(againstWiRatings).values()

  # Returns the root mean squared error (RMSE) of the predictions
  return sqrt(predictionsAndRatings.map(lambda s: (s[0] - s[1]) ** 2).reduce(add) / float(sizeAgainst))
#[END how_far]

# Read the data from Cloud SQL
# Create DataFrames
dfRates = sqlContext.load(source='jdbc', driver=jdbcDriver, url=jdbcUrl, dbtable='Rating')

rddUserRatings = dfRates.filter(dfRates.userId == 0).rdd
print(rddUserRatings.count())

# Split the data into 3 different sets: training, validating, testing
# 60% 20% 20%
rddRates = dfRates.rdd
rddTraining, rddValidating, rddTesting = rddRates.randomSplit([6,2,2])

# Add the user's ratings to the training set (union returns a new RDD)
rddTraining = rddTraining.union(rddUserRatings)
nbValidating = rddValidating.count()
nbTesting    = rddTesting.count()

print("Training: %d, validation: %d, test: %d" % (rddTraining.count(), nbValidating, rddTesting.count()))
Example #5
BEST_RANK = int(sys.argv[5])
BEST_ITERATION = int(sys.argv[6])
BEST_REGULATION = float(sys.argv[7])

TABLE_ITEMS = "Accommodation"
TABLE_RATINGS = "Rating"
TABLE_RECOMMENDATIONS = "Recommendation"

# Read the data from Cloud SQL
# Create DataFrames
#[START read_from_sql]
jdbcDriver = 'com.mysql.jdbc.Driver'
jdbcUrl = 'jdbc:mysql://%s:3306/%s?user=%s&password=%s' % (
    CLOUDSQL_INSTANCE_IP, CLOUDSQL_NAME, CLOUDSQL_USER, CLOUDSQL_PWD)
dfAccos = sqlContext.load(source='jdbc',
                          driver=jdbcDriver,
                          url=jdbcUrl,
                          dbtable=TABLE_ITEMS)
dfRates = sqlContext.load(source='jdbc',
                          driver=jdbcDriver,
                          url=jdbcUrl,
                          dbtable=TABLE_RATINGS)
#[END read_from_sql]

# Get all the ratings rows of our user
dfUserRatings = dfRates.filter(
    dfRates.userId == USER_ID).map(lambda r: r.accoId).collect()
print(dfUserRatings)

# Returns only the accommodations that have not been rated by our user
rddPotential = dfAccos.rdd.filter(lambda x: x[0] not in dfUserRatings)
pairsPotential = rddPotential.map(lambda x: (USER_ID, x[0]))
from pyspark import SparkConf, SparkContext
from pyspark.mllib.recommendation import ALS
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType

conf = SparkConf().setAppName("app_collaborative")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

jdbcDriver = 'com.mysql.jdbc.Driver'
jdbcUrl    = 'jdbc:mysql://173.194.227.120:3306/recoom?user=root'

USER_ID = 0

# Read the data from Cloud SQL
# Create DataFrames
dfAccos = sqlContext.load(source='jdbc', driver=jdbcDriver, url=jdbcUrl, dbtable='AccommodationT')
dfRates = sqlContext.load(source='jdbc', driver=jdbcDriver, url=jdbcUrl, dbtable='RatingT')

# Get all the ratings rows of our user
dfUserRatings  = dfRates.filter(dfRates.userId == USER_ID).map(lambda r: r.accoId).collect()
print(dfUserRatings)

# Returns only the accommodations that have not been rated by our user
rddPotential  = dfAccos.rdd.filter(lambda x: x[0] not in dfUserRatings)
pairsPotential = rddPotential.map(lambda x: (USER_ID, x[0]))


rddTraining, rddValidating, rddTesting = dfRates.rdd.randomSplit([6,2,2])
model = ALS.train(rddTraining, 20, 20, 0.1)
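With a trained model and the pairsPotential pairs built above, the remaining step is usually to score the unrated accommodations and write the best ones back to Cloud SQL. A minimal sketch; the Recommendation table name is an assumption here, and df.write.jdbc requires Spark 1.4+:

from pyspark.sql.types import IntegerType, FloatType

# Score every accommodation the user has not rated yet
predictions = model.predictAll(pairsPotential)

# Keep the 5 best predictions as plain (userId, accoId, prediction) tuples
topPredictions = predictions.map(lambda p: (p[0], p[1], float(p[2]))).takeOrdered(5, key=lambda r: -r[2])
print(topPredictions)

schema = StructType([StructField("userId", IntegerType(), True),
                     StructField("accoId", IntegerType(), True),
                     StructField("prediction", FloatType(), True)])

# Save the recommendations back to Cloud SQL (assumed table name)
dfToSave = sqlContext.createDataFrame(topPredictions, schema)
dfToSave.write.jdbc(url=jdbcUrl, table='Recommendation', mode='overwrite')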

"""
        filtered_list = [list(x) for x in subset_list_filtered]
        for subset in filtered_list:
            for rule in association_rules:
                r_0 = sorted(list(rule[0]))
                r_1 = sorted(list(rule[1]))
                if subset == r_0 and len(subset) > 1 and (
                        'কথা' in subset) and len(r_1) == 1 and rule[2] > 90:
                    print(r_0, '>', r_1, rule[2])
        print('-----------------------------')


# starting point of the program

if __name__ == '__main__':
    # name of the file to read
    input_file_name = sqlContext.load(Rdd1)
    test_file_name = sqlContext.load(Rdd1)
    # minimum support threshold
    minimum_support_threshold = 3
    # minimum confidence threshold
    minimum_confidence_threshold = 90
    # creating the Apriori object
    apriori = Apriori()
    # reading data from the file
    apriori.read_file(input_file_name)
    # executing the apriori algorithm
    print(
        '##########################################################################################'
    )
    print('Training Phase')
    print(
Example #8
from pyspark import sql
from pyspark.sql import SQLContext

sqc = SQLContext(sc)

# The idea is to read the CSV directly into a Spark DataFrame

#defining the schema
#msisdn,SongUniqueCode,Duration,Circle,DATE,DNIS,MODE,businesscategory
#9037991838,Hun-14-63767,202,Kolkata,10/1/2014,59090,,HindiTop20

mySchema=sql.types.StructType([
                        sql.types.StructField("msisdn",sql.types.StringType(),False),
                        sql.types.StructField("songid",sql.types.StringType(),False),
                        sql.types.StructField("duration",sql.types.IntegerType(),True),
                        sql.types.StructField("Circle",sql.types.StringType(),True),
                        sql.types.StructField("date",sql.types.StringType(),True),
                        sql.types.StructField("dnis",sql.types.StringType(),True),   # DNIS (empty in the sample row above); included so the field count matches the 8-column header
                        sql.types.StructField("mode",sql.types.StringType(),True),
                        sql.types.StructField("businesscategory",sql.types.StringType(),True)
                        ])

transdf=sqc.load(source="com.databricks.spark.csv",path ="file:///home/loq/sunil/spark/content_data.csv",schema=mySchema)

transdf.take(2)

# Alternative: reading via textFile and mapping the rows manually (kept commented out)
'''
transrdd=sc.textFile("file:///home/loq/sunil/spark/content_data.csv").\
            map(lambda x: x.split(',')).\
            map(lambda y: sql.Row(msisdn=y[0],songid=y[1],duration=y[2],circle=y[3],businesscategory=y[7]))

print(transrdd.take(2))
'''
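Once loaded with the explicit schema, the DataFrame can be queried directly; a small illustrative follow-up using the column names defined in mySchema above:

# Listen events per business category
transdf.groupBy("businesscategory").count().show()

# Average listen duration per circle
transdf.groupBy("Circle").avg("duration").show()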