def get_word_to_ix():
    """Return a dict mapping each vocabulary word to its index, or None.

    Builds the mapping from get_vocabulary(); returns None (after logging)
    when the vocabulary is unavailable or empty.
    """
    vocabulary = get_vocabulary()
    # Guard clause instead of if/else; covers both None and an empty list.
    if not vocabulary:
        # Plain string: the original used an f-string with no placeholders.
        tprint('vocabulary is empty')
        return None
    return {word: index for index, word in enumerate(vocabulary)}
def create_langy_csv(self):
    """Export every LearningTrace to learning_traces_langy.csv.

    Raises:
        CommandError: if the output csv cannot be written.
    """
    output_csv = f'{csv_directory}learning_traces_langy.csv'

    # Get data and create dataframe (comprehension replaces the append loop)
    tprint('getting traces and creating dataframe')
    traces_list = [
        {
            'frn': t.frn,
            'delta': t.delta,
            'seen': t.seen,
            'interacted': t.interacted,
            'tested': t.tested,
            'correct': t.correct,
            'p_trans': t.p_trans,
        }
        for t in LearningTrace.objects.all()
    ]
    df = pd.DataFrame(traces_list)

    # Display data
    tprint(f'{df.shape[0]} datapoints:')
    print(df.head())

    # Create csv
    tprint(f'creating {output_csv}')
    try:
        df.to_csv(output_csv, index=False)
    except Exception as e:
        # Narrowed from a bare except: still reported as a CommandError,
        # but SystemExit/KeyboardInterrupt are no longer swallowed.
        raise CommandError(f'could not create {output_csv}') from e

    tprint('done.')
def get_vocabulary():
    """Return the sorted, de-duplicated union of Duolingo and Langy words.

    Reads foreign words from the Duolingo csv and from all Translation
    objects. Returns None (after logging) when the csv cannot be read.
    """
    try:
        duolingo_df = pd.read_csv(duolingo_csv)
    except Exception:
        tprint(f'could not read {duolingo_csv}')
        return None

    duolingo_words = duolingo_df['frn'].unique().tolist()
    langy_words = [t.readable_word for t in Translation.objects.all()]

    # One set-union both combines and de-duplicates; sorted() gives a
    # deterministic order. The original's per-source sorted(list(set(...)))
    # and the numpy round-trip through np.unique were redundant.
    vocabulary = sorted(set(duolingo_words + langy_words))
    tprint(f'vocabulary contains {len(vocabulary)} unique foreign words')
    return vocabulary
def words_to_embeds(df, word_to_ix, embeddings, verbose=False):
    """Replace the 'frn' word column with numeric embedding features.

    Looks up an embedding for every foreign word, expands it into one
    feature column per embedding dimension (frn_0 .. frn_{EMBEDDING_DIM-1}),
    then drops the now-redundant 'frn' and intermediate 'embed' columns.
    Mutates df in place and also returns it.
    """
    def log(message):
        # Progress output, emitted only when requested.
        if verbose:
            tprint(message)

    # Get embeds for all foreign words
    log('getting embeds for foreign words')
    df['embed'] = df['frn'].apply(
        get_embed, word_to_ix=word_to_ix, embeddings=embeddings)

    # Create new feature per embed dimension
    log('creating features for word embeds')
    for dim in range(EMBEDDING_DIM):
        log(f'embed feature {dim + 1}/{EMBEDDING_DIM}')
        df[f'frn_{dim}'] = df.apply(get_embed_item, i=dim, axis=1)

    # Drop columns now that foreign words are represented numerically
    log('dropping columns')
    df.drop(['frn', 'embed'], axis=1, inplace=True)

    return df
def standardise(series, verbose=False, series_mean=None, series_std=None):
    """Standardise a pandas Series: (series - mean) / std.

    Args:
        series: the pandas Series to standardise.
        verbose: when True, log name, mean and std via tprint.
        series_mean: mean to use; computed from series when None.
        series_std: std to use; computed from series when None.

    Returns:
        A new standardised Series; the input is not mutated.
    """
    if verbose:
        tprint(f'Standardising series: {series.name}')

    # 'is None' rather than '== None' (PEP 8): equality comparison against
    # None is un-idiomatic and fragile for objects overriding __eq__.
    if series_mean is None:
        series_mean = series.mean()
    if series_std is None:
        series_std = series.std()

    if verbose:
        tprint(f' mean: {series_mean}')
        tprint(f' std: {series_std}')

    series_standardised = (series - series_mean) / series_std
    return series_standardised
def handle(self, *args, **kwargs):
    """Build the model-input csv from the Duolingo-subset traces csv.

    Reads the traces, replaces foreign words with embedding features,
    removes outliers, standardises the interaction statistics, and writes
    the result to a new csv.

    Raises:
        CommandError: if the input csv cannot be read, the vocabulary is
            unavailable, or the output csv cannot be written.
    """
    input_csv = f'{csv_directory}learning_traces_duolingo_subset.csv'
    output_csv = f'{csv_directory}model_input_duolingo_subset.csv'

    # Read csv and create dataframe
    tprint(f'reading {input_csv} and creating dataframe')
    try:
        df = pd.read_csv(input_csv)
    except Exception as e:
        # Narrowed from a bare except so SystemExit/KeyboardInterrupt are
        # not swallowed.
        raise CommandError(f'could not read {input_csv}') from e

    # Display data
    tprint(f'{df.shape[0]} datapoints:')
    print(df.head())

    ###################
    # Word Embeddings #
    ###################

    # Dictionary mapping unique foreign words to indices
    word_to_ix = get_word_to_ix()
    if word_to_ix is None:
        # get_word_to_ix() returns None when the vocabulary is unavailable;
        # without this guard, len(None) below would raise a TypeError.
        raise CommandError('could not build vocabulary word indices')

    # Stores embeddings for all words
    # Indices from word_to_ix are used to find the embedding for a particular word
    embeddings = nn.Embedding(len(word_to_ix), EMBEDDING_DIM)

    # Replace foreign words with embeddings
    df = words_to_embeds(df, word_to_ix, embeddings, verbose=True)

    # Display data
    tprint(f'{df.shape[0]} datapoints:')
    print(df.head())

    ##################
    # Pre-Processing #
    ##################

    # Interquartile range
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1

    # Remove outliers / extreme values
    df = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]

    # Standardisation for delta and interaction statistics only
    # Not performed on word embeddings
    for column in ('delta', 'seen', 'interacted', 'tested', 'correct'):
        df[column] = standardise(df[column], verbose=True)

    # Display data
    tprint(f'{df.shape[0]} datapoints:')
    print(df.head())

    #######
    # CSV #
    #######

    # Create csv
    tprint(f'creating {output_csv}')
    try:
        df.to_csv(output_csv, index=False)
    except Exception as e:
        raise CommandError(f'could not create {output_csv}') from e

    tprint('done.')
def create_duolingo_csv(self):
    """Transform the raw Duolingo learning-traces csv into Langy's format.

    Drops unused columns, converts lexeme tags to single words, adds the
    interaction-statistic columns Langy tracks separately, recalculates
    p_trans over the full history, and writes the result to a new csv.

    Raises:
        CommandError: if the input csv cannot be read or the output csv
            cannot be written.
    """
    # input_csv = f'{csv_directory}learning_traces.13m.csv'  # 1.21GB, 13m datapoints
    input_csv = f'{csv_directory}learning_traces.13m_subset.csv'  # 100MB, 1m datapoints
    output_csv = f'{csv_directory}learning_traces_duolingo_subset.csv'

    # Read csv and create dataframe
    tprint(f'reading {input_csv} and creating dataframe')
    try:
        df = pd.read_csv(input_csv)
    except Exception as e:
        # Narrowed from a bare except so SystemExit/KeyboardInterrupt are
        # not swallowed.
        raise CommandError(f'could not read {input_csv}') from e

    # Display data
    tprint(f'{df.shape[0]} datapoints:')
    print(df.head())

    # Remove columns
    df.drop([
        'timestamp', 'user_id', 'learning_language', 'ui_language',
        'lexeme_id', 'session_seen', 'session_correct'
    ], axis=1, inplace=True)

    #################
    # lexeme_string #
    #################

    # Transform lexeme_string from lexeme tags to single words
    # Remove <tag components>
    # Extract word after first slash /
    tprint('transforming lexeme_strings to words')
    # Assign the result rather than Series.replace(..., inplace=True): the
    # chained-inplace form is deprecated in pandas and may act on a copy.
    df['lexeme_string'] = df['lexeme_string'].replace(r'<[^>]*>', '', regex=True)
    # Raw string; the original '([^\/]*$)' carried an unnecessary \/ escape.
    df['lexeme_string'] = df['lexeme_string'].str.extract(r'([^/]*$)')

    # Remove datapoints for lexeme_strings with unexpected characters ' and +
    # (log message below also corrects the original "lemexe" typo)
    tprint('removing lexeme_string words with unexpected characters')
    df = df[~df['lexeme_string'].str.contains(r"['\+]")]

    ##################
    # Manage columns #
    ##################

    # Add new columns for additional interaction statistics
    # Duolingo combines (seen, interacted, tested) in practice sessions
    # Langy recognises these statistics distinctly
    tprint('adding new columns for Langy interaction statistics')
    df['interacted'] = df['history_seen']
    df['tested'] = df['history_seen']

    # Rename columns
    df.rename(columns={
        'p_recall': 'p_trans',
        'lexeme_string': 'frn',
        'history_seen': 'seen',
        'history_correct': 'correct',
    }, inplace=True)

    # Reorder columns
    df = df[[
        'frn', 'delta', 'seen', 'interacted', 'tested', 'correct', 'p_trans'
    ]]

    ###########
    # p_trans #
    ###########

    # Recalculate p_trans for each datapoint
    # Duolingo p_recall is calculated for each particular session, rather
    # than for the user's full history
    tprint('recalculating p_trans')
    df['p_trans'] = df['correct'] / df['tested']

    # Display data
    tprint(f'{df.shape[0]} datapoints:')
    print(df.head())

    #######
    # CSV #
    #######

    # Create csv
    tprint(f'creating {output_csv}')
    try:
        df.to_csv(output_csv, index=False)
    except Exception as e:
        raise CommandError(f'could not create {output_csv}') from e

    tprint('done.')