Example #1
import codecs

import numpy as np
import pandas as pd

# 'database' refers to ABXpy's database module (assumed import path)
import ABXpy.database.database as database


def threshold_item(item_file,
                   output_file,
                   columns,
                   lower_threshold=1,
                   upper_threshold=np.inf,
                   seed=0):
    """Randomly sample items in item_file in order to limit the number of
    element in each cell to upper_threshold, where a cell is defined
    as a unique value of the specified columns

    """
    np.random.seed(seed)
    # read input file
    with codecs.open(item_file, mode='r', encoding='UTF-8') as inp:
        header = inp.readline()
    db, _, feat_db = database.load(item_file, features_info=True)
    db = pd.concat([feat_db, db], axis=1)
    # group and sample
    with codecs.open(output_file, mode='w', encoding='UTF-8') as out:
        out.write(header)
        for group, df in db.groupby(columns):
            if len(df) >= lower_threshold:
                # shuffle dataframe
                df = df.reindex(np.random.permutation(df.index))
                m = min(upper_threshold, len(df))
                df = df.iloc[:m]
                for i in range(m):
                    out.write(" ".join([str(e)
                                        for e in df.iloc[i]]) + "\n")
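A minimal usage sketch of threshold_item, assuming an item file 'data.item' whose columns include 'phone', 'talker', 'prev-phone' and 'next-phone' (the file and column names are hypothetical):

# cap every (phone, talker, context) cell at 20 items and drop cells
# with fewer than 5 items; file and column names are hypothetical
threshold_item('data.item', 'data_thresholded.item',
               columns=['phone', 'talker', 'prev-phone', 'next-phone'],
               lower_threshold=5, upper_threshold=20, seed=42)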
Example #2
File: task.py Project: mmmaat/ABXpy
    def __init__(self, db_name, on, across=None, by=None, filters=None,
                 regressors=None, verbose=0):

        self.verbose = verbose
        assert os.path.exists(db_name), ('the item file {0} was not found'
                                         .format(db_name))

        if across is None:
            across = []
        if by is None:
            by = []
        if filters is None:
            filters = []
        if regressors is None:
            regressors = []

        # check parameters
        # using several 'on' isn't supported by the toolbox
        assert isinstance(on, str), \
            'ON attribute must be specified by a string'
        on = [on]
        if isinstance(across, str):
            across = [across]
        if isinstance(by, str):
            by = [by]

        if verbose:
            print("Verifying input...")

        # open database
        db, db_hierarchy, feat_db = database.load(db_name, features_info=True)

        # check that required columns are present
        cols = set(db.columns)
        message = (' argument is invalid, check that all the provided '
                   'attributes are defined in the database ' + db_name)
        # issuperset accepts any iterable, so lists work directly
        assert cols.issuperset(on), 'ON' + message
        assert cols.issuperset(across), 'ACROSS' + message
        assert cols.issuperset(by), 'BY' + message
        # FIXME add additional checks, for example that columns
        # in BY, ACROSS, ON are not the same ? (see task structure notes)
        # also that location columns are not used
        for col in cols:
            assert '_' not in col, \
                col + ': you cannot use underscore in column names'
            assert '#' not in col, \
                col + ": you cannot use '#' in column names"
        if verbose:
            print("Input verified")

        # if 'by' or 'across' are empty create appropriate dummy columns
        # (note that '#' is forbidden in user names for columns)
        if not by:
            db['#by'] = 0
            by = ['#by']
        if not across:
            db['#across'] = range(len(db))
            across = ['#across']
        # note that these additional columns are not in the db_hierarchy,
        # but I don't think this is problematic

        self.filters = filter_manager.FilterManager(db_hierarchy,
                                                    on, across, by,
                                                    filters)
        self.regressors = regressor_manager.RegressorManager(db,
                                                             db_hierarchy,
                                                             on, across, by,
                                                             regressors)

        self.sampling = False

        # prepare the database for generating the triplets
        self.by_dbs = {}
        self.feat_dbs = {}
        self.on_blocks = {}
        self.across_blocks = {}
        self.on_across_blocks = {}
        self.antiacross_blocks = {}
        by_groups = db.groupby(by)

        if self.verbose > 0:
            display = progress_display.ProgressDisplay()
            display.add('block', 'Preprocessing by block', len(by_groups))

        for by_key, by_frame in by_groups:
            if self.verbose > 0:
                display.update('block', 1)
                display.display()

            # allows getting the 'by' values as well as the values of other
            # variables that are determined by them
            by_values = dict(by_frame.iloc[0])
            # apply 'by' filters
            if self.filters.by_filter(by_values):
                # get analogous feat_db
                by_feat_db = feat_db.iloc[by_frame.index]
                # drop indexes
                by_frame = by_frame.reset_index(drop=True)
                # reset_index to get an index relative to the 'by' db;
                # the original index could be preserved in an additional
                # 'index' column by removing drop=True, but this would add
                # another constraint on the possible column names
                by_feat_db = by_feat_db.reset_index(drop=True)
                # apply generic filters
                by_frame = self.filters.generic_filter(by_values, by_frame)
                self.by_dbs[by_key] = by_frame
                self.feat_dbs[by_key] = by_feat_db
                self.on_blocks[by_key] = self.by_dbs[by_key].groupby(on)
                self.across_blocks[by_key] = self.by_dbs[
                    by_key].groupby(across)
                self.on_across_blocks[by_key] = self.by_dbs[
                    by_key].groupby(on + across)
                if len(across) > 1:
                    self.antiacross_blocks[by_key] = dict()
                    for across_key in self.across_blocks[by_key].groups:
                        # boolean mask keeping only the rows that differ from
                        # this across value on every across column
                        mask = np.ones(len(by_frame), dtype=bool)
                        for i, col in enumerate(across):
                            mask &= (by_frame[col] != across_key[i]).values
                        self.antiacross_blocks[by_key][
                            across_key] = by_frame.index[mask]

        # store parameters
        self.database = db_name
        self.db = db
        self.db_hierarchy = db_hierarchy
        self.on = on
        self.across = across
        self.by = by

        # determine the appropriate numeric type to represent the index
        # (currently used only for numpy arrays and h5 storage, might also
        # be used for pandas frames)
        types = {}
        for key, db in self.by_dbs.items():
            # len(db)-1 wouldn't work here because indices may be missing
            # due to generic filtering
            n = np.max(db.index.values)
            types[key] = type_fitting.fit_integer_type(n, is_signed=False)

        self.types = types

        # compute some statistics about the task
        self.compute_statistics()
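A minimal sketch of how this constructor might be called, assuming the class is ABXpy's Task and an item file 'data.item' whose columns include 'phone', 'talker', 'prev-phone' and 'next-phone' (the file and column names are hypothetical):

# minimal ABX task: discriminate phones across talkers within a fixed
# phonetic context; 'data.item' and the column names are hypothetical
task = Task('data.item', on='phone', across='talker',
            by=['prev-phone', 'next-phone'], verbose=1)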
Example #3
import glob

import numpy as np
import pandas as pd

# 'database' refers to ABXpy's database module (assumed import path)
import ABXpy.database.database as database


def threshold_item(db, item_file, upper_threshold, seed=0,
                   columns=['phone', 'context', 'talker']):
    """Write a thresholded copy of item_file keeping at most upper_threshold
    items per cell, where a cell is a unique combination of the given
    columns."""
    np.random.seed(seed)
    f = item_file[:-5] + '_upper_threshold_%d' % upper_threshold + '.item'
    with open(f, 'w') as out:
        out.write('#file onset offset ')
        out.write('#' + ' '.join(columns) + '\n')
        for group, df in db.groupby(columns):
            df = df.reindex(np.random.permutation(df.index)) # shuffle dataframe
            m = min(upper_threshold, len(df))
            df = df.iloc[:m]            
            for i in range(m):
                out.write(' '.join([str(e) for e in df.iloc[i]]) + '\n')


for item_file in glob.iglob("*.item"):
    print(item_file)
    with open(item_file, 'r') as inf:
        header = inf.readline()
    columns = header.split('#')[-1].split()
    print(columns)
    thresholds = [20]
    db, _, feat_db = database.load(item_file, features_info=True)
    db = pd.concat([feat_db, db], axis=1)
    #print db
    for upper_threshold in thresholds:
        threshold_item(db, item_file, upper_threshold, columns=columns)
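For reference, the column extraction above relies on the two-part item-file header (location columns, then annotation columns, each prefixed with '#'); a quick sketch of what the parsing yields on a typical header line:

# splitting on '#' and taking the last part recovers the annotation
# column names from an item-file header line
header = '#file onset offset #phone context talker\n'
columns = header.split('#')[-1].split()
print(columns)  # ['phone', 'context', 'talker']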
Example #4
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# 'database' refers to ABXpy's database module (assumed import path)
import ABXpy.database.database as database


def plot_statistics(item_file, stat_file, tonal=False):
    # create the pdf before the try block so pp is always defined in finally
    pp = PdfPages(stat_file)
    try:

        min_thresholds = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        db, _ = database.load(item_file)

        # ad hoc fix, for latex compatibility
        s1 = set(db['phone'])
        db['phone'] = [phone.replace("_", "-") for phone in db['phone']]
        s2 = set(db['phone'])
        assert len(s1) == len(s2), "Latex compatibility mixup!"

        """ tokens by type """
        distrib = np.array([len(df) for g, df in db.groupby(['phone', 'talker', 'prev-phone', 'next-phone'])])
        plt.figure()
        plt.hist(distrib, 100)
        plt.title('Histogram of the number of phone-talker-context types as a function of the number of tokens')
        plt.yscale('log', nonposy='clip')
        plt.xlabel('Number of tokens')
        plt.ylabel('Number of types')
        pp.savefig()

        thr = 20
        many_tokens = [(g, len(df)) for g, df in db.groupby(['phone', 'talker', 'prev-phone', 'next-phone']) if len(df) >= 150]
        many_tokens.sort(key=lambda e: e[1])  # sort by token count
        many_tokens = many_tokens[-thr:]
        print('{0} Types with the most tokens:\n'.format(thr))
        print(many_tokens)
        # TODO write this to a text file or into the pdf using something like
        # http://stackoverflow.com/questions/4018860/text-box-with-line-wrapping-in-matplotlib

        p = []
        for threshold in min_thresholds:
            p.append(len(np.where(distrib >= threshold)[0]) / float(len(distrib)))
        plt.figure()
        plt.plot(min_thresholds, p, '.-')
        plt.title('Proportion of types retained as a function of min threshold')
        plt.xlabel('Min threshold')
        plt.ylabel('Proportion of types of items retained')
        pp.savefig()

        """ number of phonetic contexts by phone """
        nb_context = [(phone, len(df.groupby(['prev-phone', 'next-phone']).groups)) for phone, df in db.groupby(['phone'])]
        nb_context.sort(key=lambda e: e[1])
        phones, nb_context = zip(*nb_context)
        plt.figure()
        plt.plot(nb_context, 'o')
        plt.xlabel('Phones')
        plt.ylabel('Number of contexts')
        plt.xticks(np.arange(len(phones)), phones, fontsize=10)
        plt.title('Number of phonetic contexts for each phone')
        #TODO bar plot + autosize x label font
        pp.savefig()

        """ contexts representation """
        nb_possible_context = len(phones)*len(phones)
        nb_context_found = len(set(zip(db['prev-phone'], db['next-phone'])))
        print('global_context_proportion %f' % (nb_context_found / float(nb_possible_context)))
        # TODO write this to a text file or into the pdf using something like
        # http://stackoverflow.com/questions/4018860/text-box-with-line-wrapping-in-matplotlib

        """
        def context_coverage(db, min_thresholds, other_bys):
            nb_contexts = [[] for t in min_thresholds]
            for g, df in db.groupby(other_bys):
                gg = df.groupby(other_bys + ['prev-phone', 'next-phone'])
                contexts = [[] for t in min_thresholds]
                for context, df2 in gg:
                    for i, threshold in enumerate(min_thresholds):
                        if len(df2) >= threshold:
                            contexts[i].append(context)
                for i in range(len(min_thresholds)):
                    nb_contexts[i].append(len(set(contexts[i])))
            for i in range(len(min_thresholds)):
                nb_contexts[i] = np.array(nb_contexts[i])
            return nb_contexts
        """

        def nb_x_per_y(db, min_thresholds, x, y, remaining=[]):
            # remaining should correspond to columns used to threshold
            # that are neither in x nor in y
            nb = [[] for t in min_thresholds]
            for _, dfy in db.groupby(y):
                nx = np.zeros(shape=(len(min_thresholds),), dtype=int)
                for _, dfx in dfy.groupby(x):
                    if remaining:
                        lens = np.array([len(dfr) for _, dfr in dfx.groupby(remaining)])
                        p = [int(any(lens >= threshold)) for threshold in min_thresholds]
                    else:
                        p = [int(len(dfx) >= threshold) for threshold in min_thresholds]
                    nx = nx + p
                for i in range(len(min_thresholds)):
                    nb[i].append(nx[i])
            for i in range(len(min_thresholds)):
                nb[i] = np.array(nb[i])
            return nb

        """
        def context_coverage(db, min_thresholds, g1, g2=[]):
            nb_contexts = [[] for t in min_thresholds]
            for g, df in db.groupby(g1):
                gg = df.groupby(g2+['prev-phone', 'next-phone'])
                contexts = [[] for t in min_thresholds]
                for context, df2 in gg:
                    if g2:
                        context = context[-1]
                    for i, threshold in enumerate(min_thresholds):
                        if len(df2) >= threshold:
                            contexts[i].append(context)
                for i in range(len(min_thresholds)):
                    nb_contexts[i].append(len(set(contexts[i])))
            for i in range(len(min_thresholds)):
                nb_contexts[i] = np.array(nb_contexts[i])
            return nb_contexts
        """

        def plot_x_per_y(title, db, min_thresholds, x, y, remaining=[]):
            data = nb_x_per_y(db, min_thresholds, x, y, remaining)
            plt.figure()
            plt.boxplot(data, labels=min_thresholds)
            plt.title(title)
            plt.xlabel('Min threshold')
            pp.savefig()

        plot_x_per_y('Contexts found for each (speaker, phone) among the {0} possible contexts'.format(nb_possible_context),
                    db, min_thresholds,
                    ['prev-phone', 'next-phone'], ['talker', 'phone'])

        plot_x_per_y('Contexts found for each phone among the {0} possible contexts'.format(nb_possible_context),
                    db, min_thresholds,
                    ['prev-phone', 'next-phone'], ['phone'], ['talker'])

        plot_x_per_y('Contexts found for each speaker among the {0} possible contexts'.format(nb_possible_context),
                    db, min_thresholds,
                    ['prev-phone', 'next-phone'], ['talker'], ['phone'])

        plot_x_per_y('Speakers found for each phone',
                    db, min_thresholds,
                    ['talker'], ['phone'], ['prev-phone', 'next-phone'])

        plot_x_per_y('Speakers found for each phone + context',
                    db, min_thresholds,
                    ['talker'], ['phone', 'prev-phone', 'next-phone'])

        if tonal:
            plot_x_per_y('Tones found for each segment + context',
                    db, min_thresholds,
                    ['tone'], ['segment', 'prev-phone', 'next-phone'], ['talker'])

            plot_x_per_y('Tones found for each segment + context + speaker',
                    db, min_thresholds,
                    ['tone'], ['segment', 'prev-phone', 'next-phone', 'talker'])

            plot_x_per_y('Segments found for each tone + context',
                    db, min_thresholds,
                    ['segment'], ['tone', 'prev-phone', 'next-phone'], ['talker'])

            plot_x_per_y('Segments found for each tone + context + speaker',
                    db, min_thresholds,
                    ['segment'], ['tone', 'prev-phone', 'next-phone', 'talker'])

        """ TODO duration of sound/number of phone tokens by speaker, by phone or both """

    finally:
        pp.close()
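A minimal usage sketch for plot_statistics, assuming item files whose columns include 'phone', 'talker', 'prev-phone' and 'next-phone' (plus 'tone' and 'segment' for the tonal case); the file names are hypothetical:

# write all statistics plots into a single pdf; file names are hypothetical
plot_statistics('data.item', 'data_stats.pdf')

# for a tonal-language corpus with 'tone' and 'segment' columns
plot_statistics('tonal_data.item', 'tonal_stats.pdf', tonal=True)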