def enumerate(row: ps.Row, enumerator: FragmentReactionSliceEnumerator, max_cuts: int) -> List[ps.Row]:
    """Slice the molecule described by ``row`` and emit one row per slice.

    The first tab-separated field of ``row`` is read as a SMILES string and
    converted with ``uc.to_mol``. When that conversion yields a falsy value
    the function returns an empty list. Otherwise every item produced by
    ``enumerator.enumerate(mol, cuts=max_cuts)`` becomes one ``ps.Row``.

    Args:
        row: Record that is split on tab characters; only the first field
            is used. NOTE(review): annotated as ``ps.Row`` but accessed
            through ``.split``, so callers presumably pass a plain text
            line -- confirm against the call site.
        enumerator: Object whose ``enumerate(mol, cuts=...)`` yields sliced
            molecules exposing ``scaffold_smiles``, ``decorations_smiles``
            and ``original_smiles``.
        max_cuts: Forwarded to the enumerator as ``cuts`` and also stored in
            every output row.

    Returns:
        A list of ``ps.Row`` objects keyed by the ``SCAFFOLDS``,
        ``DECORATIONS``, ``ORIGINAL`` and ``MAX_CUTS`` members of
        ``DataframeColumnsEnum``; empty when the SMILES could not be
        converted.
    """
    attachment_points = AttachmentPoints()
    molecule = uc.to_mol(row.split("\t")[0])

    # Nothing to slice when the SMILES did not convert to a molecule.
    if not molecule:
        return []

    return [
        ps.Row(**{
            # The scaffold is stored with its attachment point numbers removed.
            DataframeColumnsEnum.SCAFFOLDS:
                attachment_points.remove_attachment_point_numbers(sliced.scaffold_smiles),
            DataframeColumnsEnum.DECORATIONS: sliced.decorations_smiles,
            DataframeColumnsEnum.ORIGINAL: sliced.original_smiles,
            DataframeColumnsEnum.MAX_CUTS: max_cuts,
        })
        for sliced in enumerator.enumerate(molecule, cuts=max_cuts)
    ]
def collect_failures(self, row: ps.Row,
                     enumerator: FailingReactionsEnumerator) -> List[ps.Row]:
    """Collect the failed reactions reported for the molecule in ``row``.

    The first tab-separated field of ``row`` is read as a SMILES string and
    converted with ``uc.to_mol``. When that conversion yields a falsy value
    an empty list is returned. Otherwise the items yielded by
    ``enumerator.enumerate(mol, failures_limit=...)`` are turned into
    ``ps.Row`` objects, and collection stops as soon as the number of
    collected rows reaches ``self.configuration.failures_limit``.

    Args:
        row: Record that is split on tab characters; only the first field
            is used. NOTE(review): annotated as ``ps.Row`` but accessed
            through ``.split``, so callers presumably pass a plain text
            line -- confirm against the call site.
        enumerator: Object whose ``enumerate(mol, failures_limit=...)``
            yields items exposing ``reaction_smirks`` and
            ``molecule_smiles``.

    Returns:
        A list of ``ps.Row`` objects keyed by ``self._columns.REACTION`` and
        ``self._columns.ORIGINAL``; empty when the SMILES could not be
        converted.
    """
    molecule = uc.to_mol(row.split("\t")[0])
    collected = []

    # Nothing to enumerate when the SMILES did not convert to a molecule.
    if not molecule:
        return collected

    failures = enumerator.enumerate(
        molecule, failures_limit=self.configuration.failures_limit)
    for failure in failures:
        failure_fields = {
            self._columns.REACTION: failure.reaction_smirks,
            self._columns.ORIGINAL: failure.molecule_smiles,
        }
        print("found failed reaction")
        collected.append(ps.Row(**failure_fields))
        # The limit is checked after appending, so at least one row is kept
        # once the enumerator yields anything.
        if len(collected) >= self.configuration.failures_limit:
            break
    return collected
return previous_row[-1] movie_data = sys.argv[1] user_data = sys.argv[2] output = sys.argv[3] userId = 112132212 movie = Row("id", "movieName") movie_table = sc.textFile(movie_data + str("/movies.dat")) rating_table = sc.textFile(movie_data + str("/ratings.dat")) user_data_table = sc.textFile(movie_data + str("/users.dat")) new_user = sc.textFile(user_data) movieRDD = movie_table.map(lambda movie: movie.split("::")) ratingDF = (rating_table.map(lambda rating: rating.split("::")).map( lambda rate: (int(rate[0]), int(rate[1]), float(rate[2]))).map( lambda (uid, mid, rate): Rating(uid, mid, rate))).toDF() newUserRDD = new_user.map(lambda movie: movie.split(" ", 1)) joinRDD = movieRDD.cartesian(newUserRDD) joinRDD = (joinRDD.map(lambda (movie, umovie): (movie[0], movie[1], umovie[ 0], umovie[1])).map(lambda (id, movie, urate, umovie): (umovie, ( id, urate, levenshtein(movie, umovie)))).reduceByKey( lambda x1, x2: min(x1, x2, key=lambda x: x[-1]))) userMovie = ( joinRDD.map(lambda (key, value): (userId, value[0], value[1])).map(