def import_user_item(self, file):
    """Load user ratings from a CSV file into the graph.

    The first row of the file is skipped. Every following non-empty row is
    read as ``user id, movie id, rating, timestamp`` and sent to the
    database, where a ``User`` node is merged on ``userId`` and linked by a
    ``RATED`` relationship to the ``Movie`` node matched on ``movieId``.
    Work is committed every 1000 rows; a row that raises is printed together
    with the reader's line number and skipped.

    Args:
        file: Path of the CSV file to import.
    """
    # Built once instead of on every row. The pieces are joined with single
    # spaces so the statement text sent to the server stays the same.
    query = (
        " MATCH (movie:Movie {movieId: $movieId})"
        " MERGE (user:User {userId: $userId})"
        " MERGE (user)-[:RATED {rating: $rating, timestamp: $timestamp}]->(movie)"
        " "
    )
    # The file is only read, so it is opened read-only (the previous 'r+'
    # needed write permission). newline='' is what the csv module expects so
    # that it can handle line endings inside quoted fields itself.
    with open(file, 'r', newline='') as in_file:
        reader = csv.reader(in_file, delimiter=',')
        next(reader, None)  # skip the first row (presumably the header)
        with self._driver.session() as session:
            # Helper defined elsewhere; judging by its name it tolerates the
            # constraint already existing -- TODO confirm.
            self.execute_without_exception(
                "CREATE CONSTRAINT ON (u:User) ASSERT u.userId IS UNIQUE")
            tx = session.begin_transaction()
            i = 0  # rows sent since the last commit
            for row in reader:
                try:
                    if not row:
                        continue
                    user_id = strip(row[0])
                    movie_id = strip(row[1])
                    rating = strip(row[2])
                    timestamp = strip(row[3])
                    tx.run(query, {"movieId": movie_id, "userId": user_id,
                                   "rating": rating, "timestamp": timestamp})
                    i += 1
                    if i == 1000:
                        # Commit the batch and start a new transaction.
                        tx.commit()
                        i = 0
                        tx = session.begin_transaction()
                except Exception as e:
                    # Best effort: report the offending row and keep going.
                    print(e, row, reader.line_num)
            # Commit whatever is left in the last, partially filled batch.
            tx.commit()
def import_movie_details(self, file):
    """Enrich movies with details fetched through ``self._ia``.

    The first row of the CSV file is skipped. For every following non-empty
    row the first column is taken as the movie id and the second as the id
    passed to ``self._ia.get_movie``; the result is handed to
    ``self.process_movie_info`` together with the open transaction.
    Work is committed every 10 movies; a row that raises is printed together
    with the reader's line number and skipped.

    Args:
        file: Path of the CSV file that maps movie ids to lookup ids.
    """
    print("Importing details of movies")
    # The file is only read, so it is opened read-only (the previous 'r+'
    # needed write permission). newline='' is what the csv module expects.
    with open(file, 'r', newline='') as in_file:
        reader = csv.reader(in_file, delimiter=',')
        next(reader, None)  # skip the first row (presumably the header)
        with self._driver.session() as session:
            self.execute_without_exception(
                "CREATE CONSTRAINT ON (a:Person) ASSERT a.name IS UNIQUE;")
            tx = session.begin_transaction()
            i = 0  # movies handled since the last commit
            j = 0  # movies handled in total
            for row in reader:
                try:
                    if not row:
                        continue
                    movie_id = strip(row[0])
                    imdb_id = strip(row[1])
                    # NOTE(review): presumably a remote lookup; the small
                    # batch size of 10 suggests each call is slow -- confirm.
                    movie = self._ia.get_movie(imdb_id)
                    self.process_movie_info(movie_info=movie, tx=tx,
                                            movie_id=movie_id)
                    i += 1
                    j += 1
                    if i == 10:
                        tx.commit()
                        print(j, "movie details imported")
                        i = 0
                        tx = session.begin_transaction()
                except Exception as e:
                    # Best effort: report the offending row and keep going.
                    print(e, row, reader.line_num)
            # Commit whatever is left in the last, partially filled batch.
            tx.commit()
            print(j, "lines processed")
def get_movie_info(self):
    """Worker loop: fetch movie details for rows taken from a queue.

    Runs forever. Each row taken from ``self._movie_queue`` supplies a movie
    id (first element) and a lookup id (second element). The lookup id is
    passed to ``self._ia.get_movie``; on success ``[movie_id, movie]`` is put
    on ``self._writing_queue``. A failing lookup is retried up to 10 times in
    total with a 10 second pause between attempts; after the last failure the
    row is reported and dropped. All console output is serialized through
    ``self._print_lock``.
    """
    max_attempts = 10
    retry_delay_seconds = 10
    while True:
        row = self._movie_queue.get()
        try:
            with self._print_lock:
                print("Getting info for row: ", row)
            movie_id = strip(row[0])
            imdb_id = strip(row[1])
            for attempt in range(1, max_attempts + 1):
                try:
                    movie = self._ia.get_movie(imdb_id)
                    with self._print_lock:
                        print("Writing to the other queue: ", movie)
                    self._writing_queue.put([movie_id, movie])
                    break
                except Exception:
                    # Narrowed from a bare ``except:`` so that SystemExit and
                    # KeyboardInterrupt are no longer swallowed and retried.
                    with self._print_lock:
                        print("An error occurred")
                    if attempt == max_attempts:
                        with self._print_lock:
                            print("Error while getting", row)
                    else:
                        with self._print_lock:
                            print("Failed...... ", attempt)
                        # Sleep outside the lock so other workers can still
                        # print while this one waits.
                        time.sleep(retry_delay_seconds)
        finally:
            # Always mark the item as handled, even when a malformed row
            # raises before the retry loop; otherwise a join() on the queue
            # would never return.
            self._movie_queue.task_done()
def import_event_data(self, file):
    """Load contextual rating events from a CSV file into the graph.

    The first row of the file is skipped. Every following non-empty row is
    read as ``user id, item id, rating, time, location, companion``. For each
    row the ``User``, ``Time``, ``Location``, ``Companion`` and ``Item`` nodes
    are merged and a new ``Event`` node carrying the rating is created and
    linked to all five. Work is committed every 1000 rows; a row that raises
    is printed and skipped.

    Args:
        file: Path of the CSV file to import.
    """
    # The pieces are joined with single spaces so the statement text sent to
    # the server stays the same.
    query = (
        " MERGE (user:User {userId: $userId})"
        " MERGE (time:Time {value: $time})"
        " MERGE (location:Location {value: $location})"
        " MERGE (companion:Companion {value: $companion})"
        " MERGE (item:Item {itemId: $itemId})"
        " CREATE (event:Event {rating:$rating})"
        " CREATE (event)-[:EVENT_USER]->(user)"
        " CREATE (event)-[:EVENT_ITEM]->(item)"
        " CREATE (event)-[:EVENT_LOCATION]->(location)"
        " CREATE (event)-[:EVENT_COMPANION]->(companion)"
        " CREATE (event)-[:EVENT_TIME]->(time)"
        " "
    )
    with self._driver.session() as session:
        self.execute_without_exception("CREATE CONSTRAINT ON (u:User) ASSERT u.userId IS UNIQUE")
        self.execute_without_exception("CREATE CONSTRAINT ON (i:Item) ASSERT i.itemId IS UNIQUE")
        self.execute_without_exception("CREATE CONSTRAINT ON (t:Time) ASSERT t.value IS UNIQUE")
        self.execute_without_exception("CREATE CONSTRAINT ON (l:Location) ASSERT l.value IS UNIQUE")
        self.execute_without_exception("CREATE CONSTRAINT ON (c:Companion) ASSERT c.value IS UNIQUE")
        j = 0  # rows sent in total
        # The file is only read, so it is opened read-only (the previous 'r+'
        # needed write permission). newline='' is what the csv module expects.
        with open(file, 'r', newline='') as in_file:
            reader = csv.reader(in_file, delimiter=',')
            next(reader, None)  # skip the first row (presumably the header)
            tx = session.begin_transaction()
            i = 0  # rows sent since the last commit
            for row in reader:
                try:
                    if not row:
                        continue
                    # NOTE(review): unlike the other columns the user id is
                    # passed on without strip() -- confirm this is intended.
                    user_id = row[0]
                    item_id = strip(row[1])
                    rating = strip(row[2])
                    # Named time_value so the ``time`` module is not shadowed.
                    time_value = strip(row[3])
                    location = strip(row[4])
                    companion = strip(row[5])
                    tx.run(query, {"userId": user_id, "time": time_value,
                                   "location": location, "companion": companion,
                                   "itemId": item_id, "rating": rating})
                    i += 1
                    j += 1
                    if i == 1000:
                        tx.commit()
                        print(j, "lines processed")
                        i = 0
                        tx = session.begin_transaction()
                except Exception as e:
                    # Best effort: report the offending row and keep going.
                    print(e, row)
            # Commit whatever is left in the last, partially filled batch.
            tx.commit()
            print(j, "lines processed")
        # The total is reported a second time once the file is closed; kept
        # so the console output is unchanged.
        print(j, "lines processed")
def import_user_item(self, file):
    """Load purchase events from a CSV file into the graph.

    The first row of the file is skipped. Every following non-empty row is
    read as ``timestamp, user id, event type, item id``. Only rows whose
    event type equals ``"transaction"`` are written: the ``Item`` and
    ``User`` nodes are merged and connected by a ``PURCHASES`` relationship
    carrying the timestamp. Work is committed every 1000 written rows; a row
    that raises is printed together with the reader's line number and skipped.

    Args:
        file: Path of the CSV file to import.
    """
    # The pieces are joined with single spaces so the statement text sent to
    # the server stays the same.
    query = (
        " MERGE (item:Item {itemId: $itemId})"
        " MERGE (user:User {userId: $userId})"
        " MERGE (user)-[:PURCHASES { timestamp: $timestamp}]->(item)"
        " "
    )
    # The file is only read, so it is opened read-only (the previous 'r+'
    # needed write permission). newline='' is what the csv module expects.
    with open(file, 'r', newline='') as in_file:
        reader = csv.reader(in_file, delimiter=',')
        next(reader, None)  # skip the first row (presumably the header)
        with self._driver.session() as session:
            self.execute_without_exception(
                "CREATE CONSTRAINT ON (u:User) ASSERT u.userId IS UNIQUE")
            self.execute_without_exception(
                "CREATE CONSTRAINT ON (u:Item) ASSERT u.itemId IS UNIQUE")
            tx = session.begin_transaction()
            i = 0  # purchases sent since the last commit
            j = 0  # purchases sent in total
            for row in reader:
                try:
                    if not row:
                        continue
                    timestamp = strip(row[0])
                    user_id = strip(row[1])
                    event_type = strip(row[2])
                    item_id = strip(row[3])
                    if event_type != "transaction":
                        # Other event types are not imported.
                        continue
                    tx.run(query, {"itemId": item_id, "userId": user_id,
                                   "timestamp": timestamp})
                    i += 1
                    j += 1
                    if i == 1000:
                        tx.commit()
                        print(j, "lines processed")
                        i = 0
                        tx = session.begin_transaction()
                except Exception as e:
                    # Best effort: report the offending row and keep going.
                    print(e, row, reader.line_num)
            # Commit whatever is left in the last, partially filled batch.
            tx.commit()
            print(j, "lines processed")
def import_movies(self, file):
    """Load movies and their genres from a CSV file into the graph.

    The first row of the file is skipped. Every following non-empty row is
    read as ``movie id, title, genres`` where the genres column is split on
    ``|``. A ``Movie`` node is created for the row and connected with a
    ``HAS`` relationship to one merged ``Genre`` node per genre. Work is
    committed every 1000 rows; a row that raises is printed together with the
    reader's line number and skipped.

    Args:
        file: Path of the CSV file to import.
    """
    print("import movies")
    # Built once instead of on every row. The pieces are joined with single
    # spaces so the statement text sent to the server stays the same.
    query = (
        " CREATE (movie:Movie {movieId: $movieId, title: $title})"
        " with movie"
        " UNWIND $genres as genre"
        " MERGE (g:Genre {genre: genre})"
        " MERGE (movie)-[:HAS]->(g)"
        " "
    )
    # The file is only read, so it is opened read-only (the previous 'r+'
    # needed write permission). newline='' is what the csv module expects so
    # that it can handle line endings inside quoted fields itself.
    with open(file, 'r', newline='') as in_file:
        reader = csv.reader(in_file, delimiter=',')
        next(reader, None)  # skip the first row (presumably the header)
        with self._driver.session() as session:
            self.execute_without_exception("CREATE CONSTRAINT ON (a:Movie) ASSERT a.movieId IS UNIQUE; ")
            self.execute_without_exception("CREATE CONSTRAINT ON (a:Genre) ASSERT a.genre IS UNIQUE; ")
            tx = session.begin_transaction()
            i = 0  # movies sent since the last commit
            j = 0  # movies sent in total
            for row in reader:
                try:
                    if not row:
                        continue
                    movie_id = strip(row[0])
                    title = strip(row[1])
                    genres = strip(row[2])
                    tx.run(query, {"movieId": movie_id, "title": title,
                                   "genres": genres.split("|")})
                    i += 1
                    j += 1
                    if i == 1000:
                        tx.commit()
                        print(j, "movies processed")
                        i = 0
                        tx = session.begin_transaction()
                except Exception as e:
                    # Best effort: report the offending row and keep going.
                    print(e, row, reader.line_num)
            # Commit whatever is left in the last, partially filled batch.
            tx.commit()
            print(j, "lines processed")
def import_session_data(self, file):
    """Stream click data from a CSV file into the graph, row by row.

    The file is read with pandas in chunks of one million rows using the
    columns ``sessionID, timestamp, itemID, category``. For every row the
    ``Session`` and ``Item`` nodes are merged and a new ``Click`` node with
    the stringified timestamp is created and linked to both. Each chunk
    starts its own transaction; inside a chunk work is committed every
    10000 rows. A row that raises is printed and skipped.

    Args:
        file: Path of the CSV file to import.
    """
    # The pieces are joined with single spaces so the statement text sent to
    # the server is the one the method has always used.
    # NOTE(review): Item is merged on both itemId and category while itemId
    # alone is declared unique -- an item seen with two categories would
    # presumably hit the constraint; confirm against the data.
    statement = (
        " MERGE (session:Session {sessionId: $sessionId})"
        " MERGE (item:Item {itemId: $itemId, category: $category})"
        " CREATE (click:Click {timestamp: $timestamp})"
        " CREATE (session)-[:CONTAINS]->(click)"
        " CREATE (click)-[:IS_RELATED_TO]->(item)"
        " "
    )
    column_types = {"sessionID": np.int64, "itemID": np.int64, "category": object}
    column_names = ['sessionID', 'timestamp', 'itemID', 'category']
    with self._driver.session() as session:
        for constraint in (
            "CREATE CONSTRAINT ON (s:Session) ASSERT s.sessionId IS UNIQUE",
            "CREATE CONSTRAINT ON (i:Item) ASSERT i.itemId IS UNIQUE",
        ):
            self.execute_without_exception(constraint)
        total = 0  # rows sent in total
        chunks = pd.read_csv(file, header=0, dtype=column_types,
                             names=column_names, parse_dates=['timestamp'],
                             chunksize=10 ** 6)
        for chunk in chunks:
            tx = session.begin_transaction()
            pending = 0  # rows sent since the last commit
            for row in chunk.itertuples():
                try:
                    params = {
                        "sessionId": row.sessionID,
                        "itemId": row.itemID,
                        "timestamp": str(row.timestamp),
                        "category": strip(row.category),
                    }
                    tx.run(statement, params)
                    pending += 1
                    total += 1
                    if pending == 10000:
                        tx.commit()
                        print(total, "lines processed")
                        pending = 0
                        tx = session.begin_transaction()
                except Exception as e:
                    # Best effort: report the offending row and keep going.
                    print(e, row)
            # Flush the remainder of this chunk before reading the next one.
            tx.commit()
            print(total, "lines processed")
        print(total, "lines processed")
def import_session_data(self, file):
    """Group clicks by session in memory, then store the sessions in the graph.

    The CSV file is read with pandas in chunks of one million rows using the
    columns ``sessionID, timestamp, itemID, category``. Clicks are collected
    per session id, sessions with fewer than 5 clicks are discarded, and the
    clicks of each remaining session are sorted by timestamp (converted with
    ``time.mktime``). Every remaining session is then written with a single
    statement that creates the ``Session`` node, merges the ``Item`` nodes
    and creates one ``Click`` node per click. Work is committed every 2000
    sessions; a session that raises is printed and skipped.

    Args:
        file: Path of the CSV file to import.

    Returns:
        A dict mapping each kept session id to its time-ordered list of
        ``{'itemId', 'category', 'timestamp'}`` dicts.
    """
    column_types = {"sessionID": np.int64, "itemID": np.int64, "category": object}
    chunks = pd.read_csv(file, header=0, dtype=column_types,
                         names=['sessionID', 'timestamp', 'itemID', 'category'],
                         parse_dates=['timestamp'], chunksize=10 ** 6)

    # Phase 1: collect (item id, category, epoch seconds) tuples per session.
    sess_clicks = {}
    lines_read = 0
    for chunk in chunks:
        for click in chunk.itertuples():
            epoch_seconds = time.mktime(click.timestamp.timetuple())
            entry = (click.itemID, strip(click.category), epoch_seconds)
            lines_read += 1
            sess_clicks.setdefault(click.sessionID, []).append(entry)
        print(lines_read, "lines processed")
    print(lines_read, "lines processed")
    print("total number of sessions", len(sess_clicks))

    # Phase 2: drop sessions shorter than 5 clicks and put the clicks of the
    # others in chronological order, as dicts ready to be query parameters.
    by_timestamp = operator.itemgetter(2)
    for key in list(sess_clicks):
        clicks = sess_clicks[key]
        if len(clicks) < 5:
            del sess_clicks[key]
            continue
        sess_clicks[key] = [
            {'itemId': item_id, 'category': category, 'timestamp': clicked_at}
            for item_id, category, clicked_at in sorted(clicks, key=by_timestamp)
        ]
    print("total number of valid sessions", len(sess_clicks))

    # Phase 3: write one session per statement. The pieces are joined with
    # single spaces so the statement text sent to the server is unchanged.
    print("start db ingestion")
    statement = (
        " CREATE (session:Session {sessionId: $sessionId})"
        " WITH session"
        " UNWIND $items as entry"
        " MERGE (item:Item {itemId: entry.itemId, category: entry.category})"
        " CREATE (click:Click {timestamp: entry.timestamp})"
        " CREATE (click)-[:IS_RELATED_TO]->(item)"
        " CREATE (session)-[:CONTAINS]->(click)"
        " "
    )
    with self._driver.session() as session:
        self.executeNoException(session, "CREATE CONSTRAINT ON (s:Session) ASSERT s.sessionId IS UNIQUE")
        self.executeNoException(session, "CREATE CONSTRAINT ON (i:Item) ASSERT i.itemId IS UNIQUE")
        tx = session.begin_transaction()
        pending = 0  # sessions sent since the last commit
        created = 0  # sessions sent in total
        for session_id, items in sess_clicks.items():
            try:
                tx.run(statement, {"sessionId": session_id, "items": items})
                pending += 1
                created += 1
                if pending == 2000:
                    tx.commit()
                    print(created, "lines processed")
                    pending = 0
                    tx = session.begin_transaction()
            except Exception as e:
                # Best effort: report the offending session and keep going.
                print(e, session_id)
        # Commit the last partial batch only if a transaction is still open.
        try:
            if session.has_transaction():
                tx.commit()
        except Exception as e:
            print(e)
        print(created, "sessions created processed")
    return sess_clicks