def __init__(self, file):
    self.open(file)
    self.regions = []
    self.group = {}
    self.last_chunk = False
    self.curr = -1
    self.in_region = -1
    self.in_group = False
    line = self.read()
    while len(line) > 0:
        self.parse_line(line)
        line = self.read()
    # complete samples info
    for i, region in enumerate(self.regions):
        region.validate()
        lo = notes.index(region['lokey'])
        hi = notes.index(region['hikey'])
        region['notes'] = range(lo, hi + 1)
    self.collect_samples()
    used_regions = []
    unused_regions = []
    for i, region in enumerate(self.regions):
        if self.is_region_used(i):
            region.load_audio()
            # dump the raw sample data to a temporary file and drop it from
            # memory; time.time() replaces time.clock(), which was removed
            # in Python 3.8
            region['delta_sample'] = tempdir + str(time.time()) + '.dat'
            region['sample_length'] = len(region['sample_data']) * region['channels']
            region['sample_data'].T.flatten().tofile(region['delta_sample'], format='f')
            del region['sample_data']
            used_regions.append(region)
        else:
            unused_regions.append(i)
    self.regions = used_regions
    if unused_regions and OPTIONS['verbose']:
        wrap("/" * 10 + ' Notice: some samples are not used, skipping:')
        wrap(", ".join(str(i + 1) for i in unused_regions))
    self.options = {}
    for region in self.regions:
        self.options.update(region)
    self.collect_samples(do_print=False)
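# Usage sketch (hypothetical): the enclosing class is not named in this
# excerpt, so the `SfzParser` name and the input path below are assumptions
# for illustration, not part of the original source.
#
#     parser = SfzParser('strings.sfz')           # parses and validates all regions
#     print(len(parser.regions), 'regions kept')  # unused samples already dropped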
def process_row(self, frac_count=0):
    if self.tokenizer == 'regex':
        cur_sl_row = [strip_tags(s, 'sl', space=True) for s in self.am_row]
        for i in range(len(self.am_row)):
            # more than one '/' means the lexical unit has several translations
            if self.am_row[i].count('/') > 1:
                sl = strip_tags(self.am_row[i], 'sl')
                tl = strip_tags(self.dm_row[i], 'tl')
                self.process_lu_internal(sl, tl, i, cur_sl_row, frac_count)
    elif self.tokenizer == 'biltrans':
        cur_sl_row = [x['sl'] for x in self.am_row]
        for i in range(len(self.am_row)):
            if len(self.am_row[i]['tls']) > 1:
                sl = self.am_row[i]['sl']
                tl = self.dm_row[i]['tls'][0]
                if self.biltrans_wrap_lus:
                    sl = common.wrap(sl)
                    tl = common.wrap(tl)
                self.process_lu_internal(sl, tl, i, cur_sl_row, frac_count)
def biltrans_extract_frac_freq(biltrans_ambig, biltrans_annotated):
    c = Counter()
    c.read_files(biltrans_ambig,      # file with ambiguous biltrans output
                 biltrans_annotated)  # file with disambiguated biltrans output
    for sl in c.sl_tl:
        # most frequent translation first; mark it with a trailing '@'
        newtl = sorted(c.sl_tl[sl], key=lambda x: c.sl_tl[sl][x], reverse=True)
        first = True
        for tl in newtl:
            if first:
                print('%.10f %s %s @' % (c.sl_tl[sl][tl], common.wrap(sl), common.wrap(tl)))
                first = False
            else:
                print('%.10f %s %s' % (c.sl_tl[sl][tl], common.wrap(sl), common.wrap(tl)))
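# Minimal driver sketch: the file names below are placeholders, not from the
# original source. The function prints one 'freq sl tl' line per translation
# pair, with the most frequent translation of each source word marked '@'.
if __name__ == '__main__':
    biltrans_extract_frac_freq('biltrans.ambig.txt', 'biltrans.annotated.txt')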
def collect_samples(self, do_print=True):
    self.overlapping = []
    self.ignored = []
    for i, region in enumerate(self.regions):
        for note in region['notes']:
            if -1 < note < len(self.notes_samples):
                if self.notes_samples[note] != -1:
                    self.overlapping.append(notes[note])
                self.notes_samples[note] = i
            else:
                self.ignored.append(notes[note])
    if do_print and OPTIONS['verbose']:
        if len(self.overlapping) > 0:
            wrap("/" * 10 + " Notice: some regions are overlapping and would be overwritten")
            wrap(", ".join(self.overlapping))
        if len(self.ignored) > 0:
            wrap("/" * 10 + " Notice: some notes are out of range and ignored")
            wrap(", ".join(set(self.ignored)))
sl_tl = defaultdict(list)
sl_tl_defaults = {}  # default (most frequent) translation per source word
features = {}  # features[(slword, ['a', 'list'], tlword)] = 3
indexes = {}
trad_counter = defaultdict(lambda: 0)

# First read in the frequency defaults
for line in open(sys.argv[1]):
    line = line.strip()
    if len(line) < 1:
        continue
    row = common.tokenize_tagger_line(line)
    sl = common.wrap(row[0])
    tl = common.wrap(row[1])
    if tl[1] == '*':
        tl = tl[:-3] + '$'
    indexes[(sl, tl)] = trad_counter[sl]
    trad_counter[sl] += 1
    sl_tl[sl].append(tl)
    if line.count('@') > 0:
        sl_tl_defaults[sl] = tl


class Counter(BCC.BiltransCounter):
    tokenizer = 'biltrans'
    line_ids = True
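# Input shape sketch, inferred from the parsing above (the surface forms are
# invented): each line carries a frequency, a wrapped source unit and a
# wrapped target unit, with a trailing '@' marking the default translation:
#
#     0.9230769231 ^casa<n><f>$ ^house<n>$ @
#     0.0769230769 ^casa<n><f>$ ^home<n>$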
        if sl not in sl_tl:
            sl_tl[sl] = {}
        if tl not in sl_tl[sl]:
            sl_tl[sl][tl] = 0.0
        sl_tl[sl][tl] += frac_count

    dm_line = dm_file.readline()
    if dm_line == '':
        break
    current_dm_line_id = int(dm_line.split('.[][')[1].split(' ')[0])

for sl in sl_tl:
    newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x], reverse=True)
    first = True
    for tl in newtl:
        if first:
            print('%.10f %s %s @' % (sl_tl[sl][tl], common.wrap(sl), common.wrap(tl)))
            first = False
        else:
            print('%.10f %s %s' % (sl_tl[sl][tl], common.wrap(sl), common.wrap(tl)))
def extract_freq_lexicon(candidates):
    cur_line = 0
    lineno = 0
    sl_tl = {}
    cur_sl_row = []
    cur_tl_row = []
    cur_bt_row = []
    cur_al_row = []
    with open(candidates) as infile:
        for line in infile:
            line = line.strip()
            lineno += 1
            if lineno % 5000 == 0:
                sys.stderr.write('.')
            if lineno % 100000 == 0:
                sys.stderr.write(str(lineno) + '\n')
                sys.stderr.flush()
            try:
                if line[0] == '-':
                    # Read the corpus, make a note of all ambiguous words,
                    # their frequency and their possible translations:
                    #
                    #   sl_tl[sl_word][tl_word] = tl_freq
                    i = 0
                    for slword in cur_sl_row:
                        if len(cur_bt_row[i]['tls']) > 1:
                            for al in cur_al_row:
                                if al == '':
                                    continue
                                al_sl = int(al.split('-')[1])
                                al_tl = int(al.split('-')[0])
                                if al_sl != i:
                                    continue
                                if al_tl < len(cur_tl_row):
                                    tlword = cur_tl_row[al_tl]
                                else:
                                    traceback.print_stack()
                                    print('alignment out of range:', al_tl,
                                          '>= len(cur_tl_row) =', len(cur_tl_row),
                                          file=sys.stderr)
                                    exit(1)
                                if slword not in sl_tl:
                                    sl_tl[slword] = {}
                                if tlword not in sl_tl[slword]:
                                    sl_tl[slword][tlword] = 0
                                sl_tl[slword][tlword] = sl_tl[slword][tlword] + 1
                                # print('+', slword, tlword, sl_tl[slword][tlword], lineno)
                        i = i + 1
                    cur_line = 0
                    continue
                line = line.split('\t')[1]
                if cur_line == 0:
                    cur_sl_row = common.tokenise_tagger_line(line)
                elif cur_line == 1:
                    cur_bt_row = common.tokenise_biltrans_line(line)
                elif cur_line == 2:
                    cur_tl_row = common.tokenise_tagger_line(line)
                elif cur_line == 3:
                    cur_al_row = line.split(' ')
                cur_line = cur_line + 1
            except Exception:
                # print("Error in line", lineno, ":", e, file=sys.stderr)
                traceback.print_exc()
                exit(1)
    for sl in sl_tl:
        newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x], reverse=True)
        first = True
        for tl in newtl:
            if tl[0] == '*':
                print('Error: tl word unknown', tl, file=sys.stderr)
                continue
            # source and target must share their first (part-of-speech) tag
            first_tag_sl = sl.split('<')[1].split('>')[0].strip()
            first_tag_tl = tl.split('<')[1].split('>')[0].strip()
            if first_tag_sl != first_tag_tl:
                print('Error:', first_tag_sl, '!=', first_tag_tl, file=sys.stderr)
                continue
            if first:
                print(sl_tl[sl][tl], common.wrap(sl), common.wrap(tl), '@')
                first = False
            else:
                print(sl_tl[sl][tl], common.wrap(sl), common.wrap(tl))
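# Candidates-file sketch, inferred from the reading loop above (contents
# invented): each block is four tab-prefixed lines consumed in order, and a
# line starting with '-' closes the block and triggers the counting. Whatever
# precedes the tab is discarded by line.split('\t')[1], and alignment pairs
# are parsed as TL-SL per the al_sl/al_tl assignments above:
#
#     <id>TAB<source-language tagger line>    -> cur_sl_row
#     <id>TAB<biltrans line>                  -> cur_bt_row
#     <id>TAB<target-language tagger line>    -> cur_tl_row
#     <id>TAB<alignments, e.g. '0-0 2-1'>     -> cur_al_row
#     -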
def ngram_count_patterns(freq_lexicon, candidates, crisphold, max_rules):
    MAX_NGRAMS = 2
    cur_line = 0
    sl_tl_defaults = {}
    sl_tl = {}
    ngrams = {}
    lineno = 0
    for line in open(freq_lexicon).readlines():
        lineno += 1
        if lineno % 10000 == 0:
            print(lineno, file=sys.stderr)
        if len(line) < 1:
            continue
        row = common.tokenise_tagger_line(line)
        sl = common.wrap(row[0])
        tl = common.wrap(row[1])
        if tl[1] == '*':
            tl = tl[:-3] + '$'
        if line.count('@') > 0:
            sl_tl_defaults[sl] = tl
        else:
            sl_tl[sl] = tl

    cur_sl_row = []
    cur_tl_row = []
    cur_bt_row = []
    cur_al_row = []

    lineno = 0
    for line in open(candidates).readlines():
        lineno += 1
        line = line.strip()
        if lineno % 500 == 0:
            print(lineno, file=sys.stderr)
        if line[0] == '-':
            # print(len(cur_sl_row), len(cur_tl_row), len(cur_bt_row), len(cur_al_row))
            # print(cur_sl_row)
            # print(cur_bt_row)
            # print(cur_tl_row)
            # print(cur_al_row)
            #
            # Read the corpus, make a note of all ambiguous words, their
            # frequency and their possible translations:
            #
            #   sl_tl[sl_word][tl_word] = tl_freq
            i = 0
            for slword in cur_sl_row:
                if len(cur_bt_row[i]['tls']) > 1:
                    for al in cur_al_row:
                        if al == '':
                            continue
                        al_sl = int(al.split('-')[1])
                        al_tl = int(al.split('-')[0])
                        if al_sl != i:
                            continue
                        tlword = common.wrap(cur_tl_row[al_tl])
                        slword = common.wrap(slword)
                        if slword not in sl_tl_defaults:
                            print('!', file=sys.stderr)
                            continue
                        # collect the n-grams around position i: j words to the
                        # left (pregram), to the right (postgram), and on both
                        # sides (roundgram)
                        for j in range(1, MAX_NGRAMS):
                            pregram = ' '.join(map(common.wrap, cur_sl_row[i - j:i + 1]))
                            postgram = ' '.join(map(common.wrap, cur_sl_row[i:i + j + 1]))
                            roundgram = ' '.join(map(common.wrap, cur_sl_row[i - j:i + j + 1]))
                            if slword not in ngrams:
                                ngrams[slword] = {}
                            for gram in (pregram, postgram, roundgram):
                                if gram not in ngrams[slword]:
                                    ngrams[slword][gram] = {}
                                if tlword not in ngrams[slword][gram]:
                                    ngrams[slword][gram][tlword] = 0
                                ngrams[slword][gram][tlword] += 1
                i = i + 1
            cur_line = 0
            # print(line)
            continue
        line = line.split('\t')[1]
        if cur_line == 0:
            cur_sl_row = common.tokenise_tagger_line(line)
        elif cur_line == 1:
            cur_bt_row = common.tokenise_biltrans_line(line)
        elif cur_line == 2:
            cur_tl_row = common.tokenise_tagger_line(line)
        elif cur_line == 3:
            cur_al_row = line.split(' ')
        cur_line = cur_line + 1

    for sl in ngrams:
        for ngram in ngrams[sl]:
            total = 0
            max_freq = -1
            current_tl = ''
            newtl = sorted(ngrams[sl][ngram], key=lambda x: ngrams[sl][ngram][x], reverse=True)
            newtl = newtl[:max_rules]
            for tl in newtl:
                if ngrams[sl][ngram][tl] > max_freq:
                    max_freq = ngrams[sl][ngram][tl]
                    current_tl = tl
                total = total + ngrams[sl][ngram][tl]

            # > If for each of the rules we include the amount of time the
            # > translation is seen with that pattern over the total, we get
            # > a number we can try as a threshold, e.g. >0.6, >0.7, >0.8,
            # > etc. (>0.6 would be the same as: 2/3 of the time the
            # > alternative translation is seen with that ngram, and 1/3 of
            # > the time the default translation is). I think this would be
            # > easier to explain than the magic number I came up with.
            #
            # I see this as a way to define how "crispy" the decisions are.
            # I think it would be better to express this as a ratio: the
            # ratio of the times the alternative translation is seen to the
            # number of times the default translation is seen with that
            # n-gram.
            #
            # It would be "2" in this case: the alternative is seen twice as
            # often as the default.
            for tl in newtl:
                default = sl_tl_defaults[sl]
                alt_crisp = float(ngrams[sl][ngram][tl]) / float(total)
                def_crisp = 1.0
                if default in ngrams[sl][ngram]:
                    def_crisp = float(ngrams[sl][ngram][default]) / float(total)
                weight = float(ngrams[sl][ngram][tl]) / float(total)
                crispiness = alt_crisp / def_crisp
                # print('%%%', crispiness, alt_crisp, def_crisp, tl, default, ngrams[sl][ngram])
                if crispiness < crisphold:
                    print('-', crispiness, weight, total, max_freq,
                          ngrams[sl][ngram][tl],
                          '\t' + sl + '\t' + ngram + '\t' + tl + '\t' +
                          str(ngrams[sl][ngram][tl]))
                else:
                    print('+', crispiness, weight, total, max_freq,
                          ngrams[sl][ngram][tl],
                          '\t' + sl + '\t' + ngram + '\t' + tl + '\t' +
                          str(ngrams[sl][ngram][current_tl]))
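# Worked example of the crispiness ratio (the numbers are illustrative only,
# and this helper is not part of the original source): with an n-gram seen
# 4 times with the alternative translation and 2 times with the default,
# the alternative is exactly twice as "crispy" as the default.
def _crispiness_example():
    total = 6
    alt_crisp = 4 / total          # ~0.67, share of the alternative
    def_crisp = 2 / total          # ~0.33, share of the default
    return alt_crisp / def_crisp   # 2.0 -> printed '+' when crisphold <= 2.0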
sl_tl = {}
sl_tl_defaults = {}
indexes = {}
trad_counter = {}
am_counter = 0
dm_counter = 0

# First read in the frequency defaults
for line in open(sys.argv[1]):
    line = line.strip()
    if len(line) < 1:
        continue
    row = common.tokenize_tagger_line(line)
    sl = common.wrap(row[0])
    tl = common.wrap(row[1])
    if tl[1] == '*':
        tl = tl[:-3] + '$'
    if sl not in trad_counter:
        trad_counter[sl] = 0
    if sl not in sl_tl:
        sl_tl[sl] = []
    if line.count('@') > 0:
        sl_tl_defaults[sl] = tl
        sl_tl[sl].append(tl)
        indexes[(sl, tl)] = trad_counter[sl]
        trad_counter[sl] = trad_counter[sl] + 1
    else:
        sl_tl[sl].append(tl)
        indexes[(sl, tl)] = trad_counter[sl]
        trad_counter[sl] = trad_counter[sl] + 1
def write_envelopes(self):
    stt = 50  # seconds-to-ticks converter

    # volume envelope: a flattened list of (tick, level) pairs
    volume_points = 0
    volume_ticks = 0
    volume_envelope = []
    if 'ampeg_attack' not in self.options:
        volume_level = 0x40
    else:
        volume_level = 0
    vol_sustain_point = 0
    volume_envelope.append(volume_ticks)
    if 'ampeg_delay' in self.options:
        volume_ticks += float(self.options['ampeg_delay']) * stt
        volume_points += 1
        volume_level = 0
        volume_envelope.append(volume_level)
        volume_envelope.append(volume_ticks)
    if 'ampeg_start' in self.options:
        volume_level = int(float(self.options['ampeg_start']) / 100 * stt)
    if 'ampeg_attack' in self.options:
        volume_ticks += int(float(self.options['ampeg_attack']) * stt)
    volume_envelope.append(volume_level)
    volume_points += 1
    if 'ampeg_hold' in self.options:
        volume_ticks += int(float(self.options['ampeg_hold']) * stt)
    else:
        volume_level = 0x40
    volume_envelope.append(volume_ticks)
    volume_envelope.append(volume_level)
    volume_points += 1
    if 'ampeg_decay' in self.options:
        volume_ticks += int(float(self.options['ampeg_decay']) * stt)
        volume_envelope.append(volume_ticks)
        if 'ampeg_sustain' in self.options:
            volume_envelope.append(int(float(self.options['ampeg_sustain']) / 100 * stt))
        else:
            volume_envelope.append(0)
        volume_points += 1
    if 'ampeg_sustain' in self.options:
        volume_level = int(float(self.options['ampeg_sustain']) / 100 * stt)
        volume_envelope.append(volume_ticks)
        volume_envelope.append(volume_level)
        volume_points += 1
        vol_sustain_point = volume_points - 1
    if 'ampeg_release' in self.options:
        volume_ticks += int(float(self.options['ampeg_release']) * stt)
        volume_level = 0x0
        volume_envelope.append(volume_ticks)
        volume_envelope.append(volume_level)
        volume_points += 1
    if volume_ticks > 512:
        # scale the tick coordinates (even indices) down proportionally
        for i in range(len(volume_envelope) // 2):
            volume_envelope[2 * i] = int(volume_envelope[2 * i] * 512 / volume_ticks)
        if OPTIONS['verbose']:
            wrap("/" * 10 + " Too long envelope: " + str(volume_ticks) + " ticks, shrunk to 512")
    self.output_file.write(struct.pack('{0}h'.format(2 * volume_points), *volume_envelope))
    self.output_file.write(struct.pack('{0}h'.format(2 * (12 - volume_points)),
                                       *(0 for i in range(2 * (12 - volume_points)))))
    self.output_file.write(struct.pack('24h', *(0 for i in range(24))))  # panning envelope
    self.output_file.write(struct.pack('b', volume_points))
    self.output_file.write(struct.pack('b', 0))
    self.output_file.write(struct.pack('b', vol_sustain_point))
    self.output_file.write(struct.pack('5b', *(0 for i in range(5))))
    volume_type = 0
    if volume_points > 0:
        volume_type += 0b1
    if vol_sustain_point > 0:
        volume_type += 0b10
    self.output_file.write(struct.pack('b', volume_type))
    self.output_file.write(struct.pack('b', 0))
    # vibrato type/sweep/depth/rate
    self.output_file.write(struct.pack('4b', *(0 for i in range(4))))
    # envelope data
    self.output_file.write(struct.pack('h', 0))  # volume fadeout
    self.output_file.write(struct.pack('22b', *(0 for i in range(22))))  # extended data
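# Shrink sketch (illustrative values): when the envelope runs past 512 ticks,
# every tick coordinate (the even indices of the flattened (tick, level)
# pairs) is scaled down proportionally so the last point lands on 512:
#
#     volume_ticks    = 1024
#     volume_envelope = [0, 64, 256, 48, 1024, 0]
#     # after scaling:  [0, 64, 128, 48,  512, 0]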