def __calc_distribution(self, df: DataFrame) -> dict:
    """Count the rows of ``df`` that fall into each ``predicted_score`` bucket.

    Buckets start at 0.00 and are 0.05 wide, with an inclusive lower bound
    and an exclusive upper bound, up to ``0.55 - 0.60``. The final bucket,
    ``0.60 - 1.00``, is inclusive on both ends.

    Args:
        df: DataFrame exposing a ``predicted_score`` column.

    Returns:
        A dict keyed by bucket label (for example ``'0.05 - 0.10'``), in
        ascending bucket order, whose values are whatever
        ``df.where(<bucket condition>).count()`` returns.
    """
    bucket_starts = [
        0.00, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30,
        0.35, 0.40, 0.45, 0.50, 0.55, 0.60,
    ]
    distribution = {}
    for start in bucket_starts:
        # The bounds are two-decimal strings because they double as the
        # pieces of the bucket label.
        # NOTE(review): comparing the column against string bounds relies on
        # the DataFrame engine coercing them for the comparison - confirm.
        low = '%.2f' % (start)
        scores = df['predicted_score']
        if low != '0.60':
            high = '%.2f' % (start + 0.05)
            label = low + ' - ' + high
            in_bucket = (scores >= low) & (scores < high)
        else:
            # Last bucket: everything from 0.60 up to and including 1.00.
            label = low + ' - 1.00'
            in_bucket = (scores >= low) & (scores <= 1.00)
        distribution[label] = df.where(in_bucket).count()
    return distribution
def embed_vector_to_not_matched_words(self, df: DataFrame, df_vector_filler: DataFrame):
    """Attach a filler ``word_vector`` to the rows of ``df`` that have none.

    Steps:
      1. Keep the rows of ``df`` whose ``word_vector`` is null, projected
         to ``self.sentence_col_id`` and ``word``.
      2. Hand them, together with ``df_vector_filler``, to
         ``self.assign_alternative_match_word_based_on_lavenshtein``. The
         result is expected to expose a ``match`` column (presumably the
         closest filler word - see that helper).
      3. Left-join that result to ``df_vector_filler`` where ``match``
         equals the filler's ``self.word_col_name`` column.

    Args:
        df: DataFrame with ``word``, ``word_vector`` and the
            ``self.sentence_col_id`` column.
        df_vector_filler: DataFrame supplying ``word_vector`` values, keyed
            by the ``self.word_col_name`` column.

    Returns:
        A DataFrame with ``self.sentence_col_id``, the original ``word`` and
        the ``word_vector`` taken from the filler side of the join.
    """
    missing_vector = col('word_vector').isNull()
    unmatched = df.where(missing_vector).select(self.sentence_col_id, 'word')
    with_candidates = self.assign_alternative_match_word_based_on_lavenshtein(
        unmatched, df_vector_filler)

    # Alias both sides so the columns can be addressed unambiguously
    # after the join.
    base_side = with_candidates.alias('base')
    filler_side = df_vector_filler.alias('filler')
    join_condition = (
        with_candidates.match == col('filler' + '.' + self.word_col_name)
    )
    joined = base_side.join(filler_side, join_condition, how='left')

    word_from_base = col('base' + '.' + 'word').alias('word')
    vector_from_filler = col('filler' + '.' + 'word_vector').alias('word_vector')
    return joined.select(self.sentence_col_id, word_from_base, vector_from_filler)
def split(df: DataFrame, start: int or None, end: int or None) -> DataFrame:
    """Restrict ``df`` to the rows whose ``day_id`` lies within ``[start, end]``.

    Args:
        df: DataFrame to filter; it is returned untouched when no bound is
            given.
        start: Inclusive lower bound on ``day_id``. ``None`` leaves the
            range open below.
        end: Inclusive upper bound on ``day_id``. ``None`` leaves the range
            open above.

    Returns:
        ``df`` itself when both bounds are ``None``; otherwise
        ``df.where(...)`` with the condition(s) for the bounds supplied.
    """
    # NOTE: the annotation ``int or None`` evaluates to plain ``int`` at
    # runtime; the intended meaning is "an int or None".
    day_id_col = 'day_id'

    # Test against None explicitly: a bound of 0 is a real value and must not
    # be mistaken for "no bound" the way a truthiness check would.
    if start is None and end is None:
        return df

    day_id = f.col(day_id_col)
    # Build only the comparisons that are actually needed, so a missing bound
    # is never compared against None.
    if end is None:
        cond = day_id >= start
    elif start is None:
        cond = day_id <= end
    else:
        cond = (day_id >= start) & (day_id <= end)
    return df.where(cond)
def getOnlyForm1MktEqOpt(inputDataFrame: DataFrame) -> DataFrame:
    """Keep only the rows whose ``_c69`` column equals ``'FORM-1'``.

    Args:
        inputDataFrame: DataFrame exposing a ``_c69`` column.

    Returns:
        The result of filtering ``inputDataFrame`` with that condition.
    """
    is_form_1 = pf.col('_c69') == 'FORM-1'
    return inputDataFrame.where(is_form_1)