def threshold_item(item_file, output_file, columns, lower_threshold=1, upper_threshold=np.inf, seed=0):
    """Randomly sample items in item_file in order to limit the number of element in each cell to upper_threshold, where a cell is defined as a unique value of the specified columns """
    np.random.seed(seed)
    # grab the header line of the input so it can be copied verbatim
    with codecs.open(item_file, mode='r', encoding='UTF-8') as fh:
        header = fh.readline()
    # load the item database together with its feature columns
    items, _, features = database.load(item_file, features_info=True)
    items = pd.concat([features, items], axis=1)
    # subsample every cell and write the surviving rows out
    with codecs.open(output_file, mode='w', encoding='UTF-8') as out:
        out.write(header)
        for cell_key, cell in items.groupby(columns):
            # cells below the lower threshold are dropped entirely
            if len(cell) < lower_threshold:
                continue
            # shuffle the rows of the cell before truncating it
            cell = cell.reindex(np.random.permutation(cell.index))
            keep = min(upper_threshold, len(cell))
            cell = cell.iloc[:keep]
            for row in range(keep):
                out.write(u" ".join([unicode(e) for e in cell.iloc[row]]) + u"\n")
def __init__(self, db_name, on, across=None, by=None, filters=None,
             regressors=None, verbose=0):
    """Build the task structures from an item database.

    Parameters
    ----------
    db_name : str
        Path to the item file; must exist on disk.
    on : str
        Name of the single ON column (several ON columns are not
        supported by the toolbox).
    across : str or list of str, optional
        ACROSS column name(s); defaults to none.
    by : str or list of str, optional
        BY column name(s); defaults to none.
    filters : list, optional
        Filter specifications forwarded to FilterManager.
    regressors : list, optional
        Regressor specifications forwarded to RegressorManager.
    verbose : int, optional
        When > 0, display preprocessing progress.
    """
    self.verbose = verbose
    assert os.path.exists(db_name), ('the item file {0} was not found:'
                                     .format(db_name))
    # normalize optional arguments (None stands for "empty")
    if across is None:
        across = []
    if by is None:
        by = []
    if filters is None:
        filters = []
    if regressors is None:
        regressors = []
    # check parameters
    # using several 'on' isn't supported by the toolbox
    assert isinstance(on, basestring), \
        'ON attribute must be specified by a string'
    on = [on]
    # a bare string means a single column for ACROSS / BY as well
    if isinstance(across, basestring):
        across = [across]
    if isinstance(by, basestring):
        by = [by]
    if verbose:
        print("Verifying input...")
    # open database
    db, db_hierarchy, feat_db = database.load(db_name, features_info=True)
    # check that required columns are present
    cols = set(db.columns)
    message = ' argument is invalid, check that all the provided attributes are defined in the database ' + db_name
    # the argument of issuperset needs to be a list ...
    assert cols.issuperset(on), 'ON' + message
    assert cols.issuperset(across), 'ACROSS' + message
    assert cols.issuperset(by), 'BY' + message
    # FIXME add additional checks, for example that columns
    # in BY, ACROSS, ON are not the same ? (see task structure notes)
    # also that location columns are not used
    for col in cols:
        assert '_' not in col, col + ': you cannot use underscore in column names'
        assert '#' not in col, col + ': you cannot use \'#\' in column names'
    if verbose:
        print("Input verified")
    # if 'by' or 'across' are empty create appropriate dummy columns
    # (note that '#' is forbidden in user names for columns)
    if not by:
        # a single constant BY value: everything is in one BY block
        db['#by'] = 0
        by = ['#by']
    if not across:
        # unique ACROSS value per row: no two items share an ACROSS cell
        db['#across'] = range(len(db))
        across = ['#across']
    # note that this additional columns are not in the db_hierarchy,
    # but I don't think this is problematic
    self.filters = filter_manager.FilterManager(db_hierarchy, on,
                                                across, by, filters)
    self.regressors = regressor_manager.RegressorManager(db, db_hierarchy,
                                                         on, across, by,
                                                         regressors)
    self.sampling = False
    # prepare the database for generating the triplets:
    # one entry per BY block in each of these dicts
    self.by_dbs = {}
    self.feat_dbs = {}
    self.on_blocks = {}
    self.across_blocks = {}
    self.on_across_blocks = {}
    self.antiacross_blocks = {}
    by_groups = db.groupby(by)
    if self.verbose > 0:
        display = progress_display.ProgressDisplay()
        display.add('block', 'Preprocessing by block', len(by_groups))
    for by_key, by_frame in by_groups:
        if self.verbose > 0:
            display.update('block', 1)
            display.display()
        # allow to get by values as well as values of other variables
        # that are determined by these
        by_values = dict(by_frame.iloc[0])
        # apply 'by' filters
        if self.filters.by_filter(by_values):
            # get analogous feat_db
            by_feat_db = feat_db.iloc[by_frame.index]
            # drop indexes
            by_frame = by_frame.reset_index(drop=True)
            # reset_index to get an index relative to the 'by' db,
            # the original index could be conserved in an additional
            # 'index' column if necessary by removing the drop=True, but
            # this would add another constraint on the possible column name
            by_feat_db = by_feat_db.reset_index(drop=True)
            # apply generic filters
            by_frame = self.filters.generic_filter(by_values, by_frame)
            self.by_dbs[by_key] = by_frame
            self.feat_dbs[by_key] = by_feat_db
            # cache the groupby decompositions used during triplet
            # generation
            self.on_blocks[by_key] = self.by_dbs[by_key].groupby(on)
            self.across_blocks[by_key] = self.by_dbs[
                by_key].groupby(across)
            self.on_across_blocks[by_key] = self.by_dbs[
                by_key].groupby(on + across)
            if len(across) > 1:
                self.antiacross_blocks[by_key] = dict()
                for across_key in (self.across_blocks[by_key]
                                   .groups.iterkeys()):
                    # select the rows differing from across_key on every
                    # ACROSS column (boolean Series product acts as AND)
                    b = True
                    for i, col in enumerate(across):
                        b = b * (by_frame[col] != across_key[i])
                    self.antiacross_blocks[by_key][
                        across_key] = by_frame[b].index
    # store parameters
    self.database = db_name
    self.db = db
    self.db_hierarchy = db_hierarchy
    self.on = on
    self.across = across
    self.by = by
    # determining appropriate numeric type to represent index (currently
    # used only for numpy arrays and h5 storage, might also be used for
    # panda frames)
    types = {}
    for key, db in self.by_dbs.iteritems():
        # len(db)-1 wouldn't work here because there could be missing index
        # due to generic filtering
        n = np.max(db.index.values)
        types[key] = type_fitting.fit_integer_type(n, is_signed=False)
    self.types = types
    # compute some statistics about the task
    self.compute_statistics()
import glob


def threshold_item(db, item_file, upper_threshold, seed=0, columns=None):
    """Subsample db so that each cell (unique combination of `columns`)
    keeps at most `upper_threshold` rows, and write the result to a new
    item file named after `item_file`.

    Parameters
    ----------
    db : pandas.DataFrame
        Item database (feature columns concatenated with item columns).
    item_file : str
        Path of the source '.item' file; the output is written beside it
        with an '_upper_threshold_<n>' suffix.
    upper_threshold : int
        Maximum number of rows kept per cell.
    seed : int, optional
        Seed of the random shuffling (default 0).
    columns : list of str, optional
        Columns defining a cell; defaults to ['phone', 'context', 'talker'].
    """
    # avoid the mutable-default-argument pitfall
    if columns is None:
        columns = ['phone', 'context', 'talker']
    # fix: the parameter used to be misspelled 'upper_threshod' and unused
    # (the body silently read the module-level 'upper_threshold' instead),
    # and the 'seed' argument was ignored (hard-coded 0)
    np.random.seed(seed)
    f = item_file[:-5] + '_upper_threshold_%d' % upper_threshold + '.item'
    with open(f, 'w') as out:
        # fix: separate the location fields from the column names with a
        # space so the header matches the '#file onset offset #...' format
        # that the script below parses back
        out.write('#file onset offset')
        out.write(' #' + ' '.join(columns) + '\n')
        for group, df in db.groupby(columns):
            # shuffle the cell before truncating so the kept rows are random
            df = df.reindex(np.random.permutation(df.index))
            m = min(upper_threshold, len(df))
            df = df.iloc[:m]
            for i in range(m):
                out.write(' '.join([str(e) for e in df.iloc[i]]) + '\n')


# script: threshold every item file found in the current directory
for item_file in glob.iglob("*.item"):
    print(item_file)
    with open(item_file, 'r') as inf:
        header = inf.readline()
    # the column names are everything after the last '#' of the header
    columns = header.split('#')[-1].split()
    print(columns)
    thresholds = [20]
    db, _, feat_db = database.load(item_file, features_info=True)
    db = pd.concat([feat_db, db], axis=1)
    for upper_threshold in thresholds:
        threshold_item(db, item_file, upper_threshold, columns=columns)
# NOTE(review): duplicated fragment — the opening `def threshold_item(db,`
# of this signature lies outside this chunk; the content mirrors the
# definition above (same typo'd unused 'upper_threshod' parameter, same
# script loop). Kept byte-identical; a comment here is legal because it
# sits inside the still-open signature parentheses.
item_file, upper_threshod, seed=0, columns=['phone', 'context', 'talker']): np.random.seed(0) f = item_file[:-5] + '_upper_threshold_%d' % upper_threshold + '.item' with open(f, 'w') as out: out.write('#file onset offset') out.write('#' + ' '.join(columns) + '\n') for group, df in db.groupby(columns): df = df.reindex(np.random.permutation( df.index)) # shuffle dataframe m = min(upper_threshold, len(df)) df = df.iloc[:m] for i in range(m): out.write(' '.join([str(e) for e in df.iloc[i]]) + '\n') for item_file in glob.iglob("*.item"): print item_file with open(item_file, 'r') as inf: header = inf.readline() columns = header.split('#')[-1].split() print columns thresholds = [20] db, _, feat_db = database.load(item_file, features_info=True) db = pd.concat([feat_db, db], axis=1) #print db for upper_threshold in thresholds: threshold_item(db, item_file, upper_threshold, columns=columns)
def plot_statistics(item_file, stat_file, tonal=False):
    """Plot coverage statistics of an item file into a multi-page PDF.

    Parameters
    ----------
    item_file : str
        Path to the item file; the loaded database is assumed to contain
        'phone', 'talker', 'prev-phone' and 'next-phone' columns (plus
        'tone' and 'segment' when tonal=True) — TODO confirm with callers.
    stat_file : str
        Path of the PDF written via matplotlib's PdfPages.
    tonal : bool, optional
        When True, also plot tone/segment coverage statistics.

    NOTE(review): if PdfPages(stat_file) itself raises, 'pp' is unbound
    and the finally clause raises NameError — confirm intent.
    """
    try:
        pp = PdfPages(stat_file)
        min_thresholds = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        db, _ = database.load(item_file)
        # ad hoc fix, for latex compatibility: '_' in phone labels would be
        # interpreted as subscripts; the assert checks no two phones merge
        s1 = set(db['phone'])
        db['phone'] = [phone.replace("_", "-") for phone in db['phone']]
        s2 = set(db['phone'])
        assert len(s1) == len(s2), "Latex compatibility mixup!"
        """ tokens by type """
        # number of tokens of each (phone, talker, context) type
        distrib = np.array([len(df) for g, df
                            in db.groupby(['phone', 'talker',
                                           'prev-phone', 'next-phone'])])
        plt.figure()
        plt.hist(distrib, 100)
        plt.title('Histogram of the number of phone-talker-context types as a function of the number of tokens')
        # NOTE(review): 'nonposy' was removed in recent matplotlib
        # (renamed 'nonpositive') — fine on the matplotlib pinned here
        plt.yscale('log', nonposy='clip')
        plt.xlabel('Number of tokens')
        plt.ylabel('Number of types')
        pp.savefig()
        thr = 20
        # NOTE(review): the filter keeps types with >= 150 tokens although
        # only the top thr=20 are printed — 150 looks like a leftover
        # hard-coded cutoff; confirm
        many_tokens = [(g, len(df)) for g, df
                       in db.groupby(['phone', 'talker',
                                      'prev-phone', 'next-phone'])
                       if len(df) >= 150]
        many_tokens.sort(key=lambda e: e[1])  # sort according to length
        many_tokens = many_tokens[-thr:]
        print('{0} Types with the most tokens:\n'.format(thr))
        print(many_tokens)
        #TODO plot this in a text file or in the pdf using smthg like
        # http://stackoverflow.com/questions/4018860/text-box-with-line-wrapping-in-matplotlib
        # proportion of types whose token count reaches each threshold
        p = []
        for threshold in min_thresholds:
            # NOTE(review): np.float is deprecated in numpy >= 1.20; the
            # call forces float division under Python 2
            p.append(len(np.where(distrib >= threshold)[0])/np.float(len(distrib)))
        plt.figure()
        plt.plot(min_thresholds, p, '.-')
        plt.title('Proportion of types retained as a function of min threshold')
        plt.xlabel('Min threshold')
        plt.ylabel('Proportion of types of items retained')
        pp.savefig()
        """ number of phonetic contexts by phone """
        nb_context = [(phone, len(df.groupby(['prev-phone',
                                              'next-phone']).groups))
                      for phone, df in db.groupby(['phone'])]
        nb_context.sort(key=lambda e: e[1])
        phones, nb_context = zip(*nb_context)
        plt.figure()
        plt.plot(nb_context, 'o')
        plt.xlabel('Phones')
        plt.ylabel('Number of contexts')
        plt.xticks(np.arange(len(phones)), phones, fontsize=10)
        plt.title('Number of phonetic contexts for each phone')
        #TODO bar plot + autosize x label font
        pp.savefig()
        """ contexts representation """
        nb_possible_context = len(phones)*len(phones)
        nb_context_found = len(set(zip(db['prev-phone'], db['next-phone'])))
        print('global_context_proportion %f'
              % (nb_context_found/np.float(nb_possible_context)))
        #TODO plot this in a text file or in the pdf using smthg like
        # http://stackoverflow.com/questions/4018860/text-box-with-line-wrapping-in-matplotlib
        # NOTE(review): the following triple-quoted strings hold two
        # superseded drafts kept as dead code, not docstrings
        """
        def context_coverage(db, min_tresholds, other_bys):
            nb_contexts = [[] for t in min_thresholds]
            for g, df in db.groupby(other_bys):
                gg = df.groupby(other_bys + ['prev-phone', 'next-phone'])
                contexts = [[] for t in min_thresholds]
                for context, df2 in gg:
                    for i, threshold in enumerate(min_thresholds):
                        if len(df2) >= threshold:
                            contexts[i].append(context)
                for i in range(len(min_thresholds)):
                    nb_contexts[i].append(len(set(contexts[i])))
            for i in range(len(min_thresholds)):
                nb_contexts[i] = np.array(nb_contexts[i])
            return nb_contexts
        """
        def nb_x_per_y(db, min_tresholds, x, y, remaining=[]):
            # count, for each y-group and each threshold, how many x-groups
            # reach that token threshold; returns one array per threshold.
            # NOTE(review): the 'min_tresholds' parameter (typo) is unused;
            # the body reads 'min_thresholds' from the enclosing scope.
            # 'remaining=[]' is a mutable default, but it is never mutated.
            # remaining should correspond to columns used to threshold
            # that are neither in x nor in y
            nb = [[] for t in min_thresholds]
            for _, dfy in db.groupby(y):
                # NOTE(review): np.int is deprecated in numpy >= 1.20
                nx = np.zeros(shape=(len(min_thresholds),), dtype=np.int)
                for _, dfx in dfy.groupby(x):
                    if remaining:
                        # an x-group counts if any remaining-subgroup
                        # reaches the threshold
                        l = np.array([len(dfr) for _, dfr
                                      in dfx.groupby(remaining)])
                        p = [int(any(l >= threshold))
                             for threshold in min_thresholds]
                    else:
                        p = [int(len(dfx) >= threshold)
                             for threshold in min_thresholds]
                    nx = nx + p
                for i in range(len(min_thresholds)):
                    nb[i].append(nx[i])
            for i in range(len(min_thresholds)):
                nb[i] = np.array(nb[i])
            return nb
        """
        def context_coverage(db, min_tresholds, g1, g2=[]):
            nb_contexts = [[] for t in min_thresholds]
            for g, df in db.groupby(g1):
                gg = df.groupby(g2+['prev-phone', 'next-phone'])
                contexts = [[] for t in min_thresholds]
                for context, df2 in gg:
                    if g2:
                        context = context[-1]
                    for i, threshold in enumerate(min_thresholds):
                        if len(df2) >= threshold:
                            contexts[i].append(context)
                for i in range(len(min_thresholds)):
                    nb_contexts[i].append(len(set(contexts[i])))
            for i in range(len(min_thresholds)):
                nb_contexts[i] = np.array(nb_contexts[i])
            return nb_contexts
        """
        def plot_x_per_y(title, db, min_thresholds, x, y, remaining=[]):
            # boxplot of nb_x_per_y results, one box per threshold
            data = nb_x_per_y(db, min_thresholds, x, y, remaining)
            plt.figure()
            plt.boxplot(data, labels=min_thresholds)
            plt.title(title)
            plt.xlabel('Min threshold')
            pp.savefig()
        plot_x_per_y('Contexts found for each (speaker, phone) among the {0} possible contexts'.format(nb_possible_context),
                     db, min_thresholds,
                     ['prev-phone', 'next-phone'], ['talker', 'phone'])
        plot_x_per_y('Contexts found for each phone among the {0} possible contexts'.format(nb_possible_context),
                     db, min_thresholds,
                     ['prev-phone', 'next-phone'], ['phone'], ['talker'])
        plot_x_per_y('Contexts found for each speaker among the {0} possible contexts'.format(nb_possible_context),
                     db, min_thresholds,
                     ['prev-phone', 'next-phone'], ['talker'], ['phone'])
        plot_x_per_y('Speakers found for each phone',
                     db, min_thresholds,
                     ['talker'], ['phone'], ['prev-phone', 'next-phone'])
        plot_x_per_y('Speakers found for each phone + context',
                     db, min_thresholds,
                     ['talker'], ['phone', 'prev-phone', 'next-phone'])
        if tonal:
            # tone/segment statistics, only meaningful for tonal languages
            plot_x_per_y('Tones found for each segment + context',
                         db, min_thresholds,
                         ['tone'], ['segment', 'prev-phone', 'next-phone'],
                         ['talker'])
            plot_x_per_y('Tones found for each segment + context + speaker',
                         db, min_thresholds,
                         ['tone'], ['segment', 'prev-phone', 'next-phone',
                                    'talker'])
            plot_x_per_y('Segments found for each tone + context',
                         db, min_thresholds,
                         ['segment'], ['tone', 'prev-phone', 'next-phone'],
                         ['talker'])
            plot_x_per_y('Segments found for each tone + context + speaker',
                         db, min_thresholds,
                         ['segment'], ['tone', 'prev-phone', 'next-phone',
                                       'talker'])
        """ TODO duration of sound/number of phone tokens by speaker, by phone or both """
    finally:
        pp.close()