Example 1
def get_word_to_ix():
    vocabulary = get_vocabulary()
    if vocabulary:
        return {word: index for index, word in enumerate(vocabulary)}
    else:
        tprint('vocabulary is empty')
        return None
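
A minimal usage sketch (the vocabulary below is made up for illustration; get_vocabulary itself is shown in Example 3):

vocabulary = ['casa', 'gato', 'perro']
word_to_ix = {word: index for index, word in enumerate(vocabulary)}
print(word_to_ix)  # {'casa': 0, 'gato': 1, 'perro': 2}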
Example 2
    def create_langy_csv(self):
        output_csv = f'{csv_directory}learning_traces_langy.csv'

        # Get data and create dataframe
        tprint('getting traces and creating dataframe')
        traces_list = []
        for t in LearningTrace.objects.all():
            traces_list.append({
                'frn': t.frn,
                'delta': t.delta,
                'seen': t.seen,
                'interacted': t.interacted,
                'tested': t.tested,
                'correct': t.correct,
                'p_trans': t.p_trans
            })
        df = pd.DataFrame(traces_list)

        # Display data
        tprint(f'{df.shape[0]} datapoints:')
        print(df.head())

        # Create csv
        tprint(f'creating {output_csv}')
        try:
            df.to_csv(output_csv, index=False)
        except Exception:
            raise CommandError(f'could not create {output_csv}')

        tprint('done.')
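
A note on the trace loop above: Django querysets can emit dictionaries directly via .values(), which avoids instantiating a full model object per trace. A sketch of the equivalent (assuming the same field names on LearningTrace):

fields = ['frn', 'delta', 'seen', 'interacted', 'tested', 'correct', 'p_trans']
df = pd.DataFrame(list(LearningTrace.objects.values(*fields)))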
Example 3
def get_vocabulary():
    duolingo_df = None

    try:
        duolingo_df = pd.read_csv(duolingo_csv)
    except Exception:
        tprint(f'could not read {duolingo_csv}')
        return None
    duolingo_words = duolingo_df['frn'].unique().tolist()

    langy_words = [t.readable_word for t in Translation.objects.all()]

    # np.unique both dedupes and sorts, so no separate sorted() call is needed
    vocabulary = np.unique(duolingo_words + langy_words).tolist()
    tprint(f'vocabulary contains {len(vocabulary)} unique foreign words')

    return vocabulary
Example 4
def words_to_embeds(df, word_to_ix, embeddings, verbose=False):
    # Get embeds for all foreign words
    if verbose: tprint('getting embeds for foreign words')
    df['embed'] = df['frn'].apply(get_embed,
                                  word_to_ix=word_to_ix,
                                  embeddings=embeddings)

    # Create new feature per embed dimension
    if verbose: tprint('creating features for word embeds')
    for i in range(EMBEDDING_DIM):
        if verbose: tprint(f'embed feature {i+1}/{EMBEDDING_DIM}')
        df[f'frn_{i}'] = df.apply(get_embed_item, i=i, axis=1)

    # Drop columns now that foreign words are represented numerically
    if verbose: tprint('dropping columns')
    df.drop(['frn', 'embed'], axis=1, inplace=True)

    return df
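
The helpers get_embed and get_embed_item are referenced above but not shown. A minimal sketch of what they might look like (hypothetical implementations, assuming embeddings is a torch.nn.Embedding as in Example 6):

import torch

def get_embed(word, word_to_ix, embeddings):
    # Hypothetical: look up the word's index and return its embedding vector
    return embeddings(torch.tensor(word_to_ix[word])).detach().numpy()

def get_embed_item(row, i):
    # Hypothetical: pull dimension i out of the row's stored embedding
    return row['embed'][i]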
Example 5
def standardise(series, verbose=False, series_mean=None, series_std=None):
    if verbose: tprint(f'Standardising series: {series.name}')

    if series_mean is None:
        series_mean = series.mean()
    if series_std is None:
        series_std = series.std()

    if verbose:
        tprint(f'   mean: {series_mean}')
        tprint(f'   std: {series_std}')

    series_standardised = (series - series_mean) / series_std
    return series_standardised
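
The optional series_mean and series_std parameters let a series be standardised with statistics computed elsewhere, e.g. applying training-set statistics to held-out data. A usage sketch (train and test are hypothetical pandas Series of the same feature):

test_standardised = standardise(test,
                                series_mean=train.mean(),
                                series_std=train.std())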
Example 6
    def handle(self, *args, **kwargs):
        input_csv = f'{csv_directory}learning_traces_duolingo_subset.csv'
        output_csv = f'{csv_directory}model_input_duolingo_subset.csv'

        # Read csv and create dataframe
        tprint(f'reading {input_csv} and creating dataframe')
        df = None
        try:
            df = pd.read_csv(input_csv)
        except Exception:
            raise CommandError(f'could not read {input_csv}')

        # Display data
        tprint(f'{df.shape[0]} datapoints:')
        print(df.head())

        ###################
        # Word Embeddings #
        ###################

        # Dictionary mapping unique foreign words to indices
        word_to_ix = get_word_to_ix()
        if word_to_ix is None:  # get_word_to_ix returns None for an empty vocabulary
            raise CommandError('could not build vocabulary')

        # Stores embeddings for all words
        # Indices from word_to_ix are used to find the embedding for a particular word
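        # Note: nn.Embedding weights are randomly initialised; they are not trained by this command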
        embeddings = nn.Embedding(len(word_to_ix), EMBEDDING_DIM)

        # Replace foreign words with embeddings
        df = words_to_embeds(df, word_to_ix, embeddings, verbose=True)

        # Display data
        tprint(f'{df.shape[0]} datapoints:')
        print(df.head())

        ##################
        # Pre-Processing #
        ##################

        # Interquartile range
        Q1 = df.quantile(0.25)
        Q3 = df.quantile(0.75)
        IQR = Q3 - Q1

        # Remove outliers / extreme values
        df = df[~((df < (Q1 - 1.5 * IQR))
                  | (df > (Q3 + 1.5 * IQR))).any(axis=1)]

        # Standardisation for delta and interaction statistics only
        # Not performed on word embeddings
        df['delta'] = standardise(df['delta'], verbose=True)
        df['seen'] = standardise(df['seen'], verbose=True)
        df['interacted'] = standardise(df['interacted'], verbose=True)
        df['tested'] = standardise(df['tested'], verbose=True)
        df['correct'] = standardise(df['correct'], verbose=True)

        # Display data
        tprint(f'{df.shape[0]} datapoints:')
        print(df.head())

        #######
        # CSV #
        #######

        # Create csv
        tprint(f'creating {output_csv}')
        try:
            df.to_csv(output_csv, index=False)
        except Exception:
            raise CommandError(f'could not create {output_csv}')

        tprint('done.')
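
For reference, a toy sketch of the IQR outlier rule used above: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are dropped (the series is made up):

import pandas as pd

s = pd.Series([1, 2, 3, 4, 100])
q1, q3 = s.quantile(0.25), s.quantile(0.75)
iqr = q3 - q1
print(s[(s >= q1 - 1.5 * iqr) & (s <= q3 + 1.5 * iqr)])  # drops the outlier 100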
Example 7
    def create_duolingo_csv(self):
        # input_csv = f'{csv_directory}learning_traces.13m.csv'  # 1.21GB, 13m datapoints
        input_csv = f'{csv_directory}learning_traces.13m_subset.csv'  # 100MB, 1m datapoints
        output_csv = f'{csv_directory}learning_traces_duolingo_subset.csv'

        # Read csv and create dataframe
        tprint(f'reading {input_csv} and creating dataframe')
        df = None
        try:
            df = pd.read_csv(input_csv)
        except Exception:
            raise CommandError(f'could not read {input_csv}')

        # Display data
        tprint(f'{df.shape[0]} datapoints:')
        print(df.head())

        # Remove columns
        df.drop(['timestamp', 'user_id', 'learning_language', 'ui_language',
                 'lexeme_id', 'session_seen', 'session_correct'],
                axis=1, inplace=True)

        #################
        # lexeme_string #
        #################

        # Transform lexeme_string from lexeme tags to single words
        # Remove <tag components>
        # Extract the word after the slash /
        tprint('transforming lexeme_strings to words')
        df['lexeme_string'] = df['lexeme_string'].replace(r'<[^>]*>', '',
                                                          regex=True)
        df['lexeme_string'] = df['lexeme_string'].str.extract(r'([^/]*$)',
                                                              expand=False)

        # Remove datapoints for lexeme_strings with unexpected characters ' and +
        tprint('removing lexeme_string words with unexpected characters')
        df = df[~df['lexeme_string'].str.contains(r"['+]")]

        ##################
        # Manage columns #
        ##################

        # Add new columns for additional interaction statistics
        # Duolingo conflates seen, interacted and tested in practice sessions,
        # whereas Langy tracks these statistics separately, so both new
        # columns start from history_seen
        tprint('adding new columns for Langy interaction statistics')
        df['interacted'] = df['history_seen']
        df['tested'] = df['history_seen']

        # Rename columns
        df.rename(columns={
            'p_recall': 'p_trans',
            'lexeme_string': 'frn',
            'history_seen': 'seen',
            'history_correct': 'correct',
        }, inplace=True)

        # Reorder columns
        df = df[[
            'frn', 'delta', 'seen', 'interacted', 'tested', 'correct',
            'p_trans'
        ]]

        ###########
        # p_trans #
        ###########

        # Recalculate p_trans for each datapoint
        # Duolingo's p_recall is calculated per session, rather than over the user's full history
        tprint('recalculating p_trans')
        df['p_trans'] = df['correct'] / df['tested']

        # Display data
        tprint(f'{df.shape[0]} datapoints:')
        print(df.head())

        #######
        # CSV #
        #######

        # Create csv
        tprint(f'creating {output_csv}')
        try:
            df.to_csv(output_csv, index=False)
        except Exception:
            raise CommandError(f'could not create {output_csv}')

        tprint('done.')
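
For reference, a sketch of the lexeme_string transformation above, applied to a single made-up tag string:

import re

s = 'perro/perro<n><m><sg>'
s = re.sub(r'<[^>]*>', '', s)           # remove <tag components> -> 'perro/perro'
s = re.search(r'([^/]*$)', s).group(1)  # keep text after the slash -> 'perro'
print(s)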