Example #1
    def __init__(self, file):
        self.open(file)

        self.regions = []
        self.group = {}
        self.last_chunk = False
        self.curr = -1
        self.in_region = -1
        self.in_group = False

        line = self.read()
        while len(line) > 0:
            self.parse_line(line)
            line = self.read()

        # complete the per-region info: turn lokey/hikey into a note range
        for i, region in enumerate(self.regions):
            region.validate()
            lo = notes.index(region['lokey'])
            hi = notes.index(region['hikey'])
            region['notes'] = range(lo, hi + 1)

        self.collect_samples()

        used_regions = []
        unused_regions = []
        for i, region in enumerate(self.regions):
            if self.is_region_used(i):
                region.load_audio()
                region['delta_sample'] = tempdir + str(time.time()) + '.dat'  # time.clock() was removed in Python 3.8
                region['sample_length'] = len(region['sample_data']) * region['channels']
                region['sample_data'].T.flatten().tofile(region['delta_sample'], format='f')
                del region['sample_data']  # free the raw audio once it is cached on disk
                used_regions.append(region)
            else:
                unused_regions.append(i)

        self.regions = used_regions

        if unused_regions and OPTIONS['verbose']:
            wrap("/"*10 + ' Notice: some samples are not used, skipping:')
            wrap(", ".join([str(i+1) for i in unused_regions]))

        self.options = {}
        for region in self.regions:
            self.options.update(region)

        self.collect_samples(do_print=False)
Example #2
    def process_row(self, frac_count=0):
        if self.tokenizer == 'regex':
            cur_sl_row = [strip_tags(s, 'sl', space=True) for s in self.am_row]
            for i in range(len(self.am_row)):
                if self.am_row[i].count('/') > 1:
                    sl = strip_tags(self.am_row[i], 'sl')
                    tl = strip_tags(self.dm_row[i], 'tl')
                    self.process_lu_internal(sl, tl, i, cur_sl_row, frac_count)
        elif self.tokenizer == 'biltrans':
            cur_sl_row = [x['sl'] for x in self.am_row]
            for i in range(len(self.am_row)):
                if len(self.am_row[i]['tls']) > 1:
                    sl = self.am_row[i]['sl']
                    tl = self.dm_row[i]['tls'][0]
                    if self.biltrans_wrap_lus:
                        sl = common.wrap(sl)
                        tl = common.wrap(tl)
                    self.process_lu_internal(sl, tl, i, cur_sl_row, frac_count)
Example #3
def biltrans_extract_frac_freq(biltrans_ambig, biltrans_annotated):

    c = Counter()
    c.read_files(
        biltrans_ambig,  # File with ambiguous biltrans output
        biltrans_annotated)  # File with disambiguated biltrans output

    for sl in c.sl_tl:
        newtl = sorted(c.sl_tl[sl], key=lambda x: c.sl_tl[sl][x])
        newtl.reverse()
        first = True
        for tl in newtl:
            if first:
                print('%.10f %s %s @' %
                      (c.sl_tl[sl][tl], common.wrap(sl), common.wrap(tl)))
                first = False
            else:
                print('%.10f %s %s' %
                      (c.sl_tl[sl][tl], common.wrap(sl), common.wrap(tl)))
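The trailing '@' marks the first line printed for each source word, i.e. its most frequent (default) translation; the later examples read this marker back via line.count('@'). A hypothetical output fragment, assuming common.wrap() adds the usual ^...$ lexical-unit delimiters (invented words and weights, for illustration only):

    0.7500000000 ^casa<n>$ ^house<n>$ @
    0.2500000000 ^casa<n>$ ^home<n>$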
Example #4
    def collect_samples(self, do_print=True):
        self.overlapping = []
        self.ignored = []

        for i, region in enumerate(self.regions):
            for note in region['notes']:
                if note < len(self.notes_samples) and note > -1:
                    if self.notes_samples[note] != -1:
                        self.overlapping.append(notes[note])
                    self.notes_samples[note] = i
                else:
                    self.ignored.append(notes[note])

        if do_print and OPTIONS['verbose']:
            if len(self.overlapping) > 0:
                wrap("/"*10 + " Notice: some regions are overlapping and would be overwritten")
                wrap(", ".join(self.overlapping))
            if len(self.ignored) > 0:
                wrap("/"*10 + " Notice: some notes are out of range and ignored")
                wrap(", ".join(set(self.ignored)))
Example #5
sl_tl = defaultdict(list)
sl_tl_defaults = {}

features = {}  # features[(slword, ('a', 'list'), tlword)] = 3

indexes = {}
trad_counter = defaultdict(lambda: 0)

# First read in the frequency defaults

for line in open(sys.argv[1]):
    line = line.strip()
    if len(line) < 1:
        continue

    row = common.tokenize_tagger_line(line)
    sl = common.wrap(row[0])
    tl = common.wrap(row[1])
    if tl[1] == '*':
        tl = tl[:-3] + '$'

    indexes[(sl, tl)] = trad_counter[sl]
    trad_counter[sl] += 1
    sl_tl[sl].append(tl)

    if line.count('@') > 0:
        sl_tl_defaults[sl] = tl


class Counter(BCC.BiltransCounter):
    tokenizer = 'biltrans'
    line_ids = True
Example #6
                    sl_tl[sl] = {}
                if tl not in sl_tl[sl]:
                    sl_tl[sl][tl] = 0.0
                sl_tl[sl][tl] = sl_tl[sl][tl] + frac_count

        dm_line = dm_file.readline()
        if dm_line == '':
            break
        current_dm_line_id = int(dm_line.split('.[][')[1].split(' ')[0])


for sl in sl_tl:
    newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x])
    newtl.reverse()
    first = True
    for tl in newtl:
        if first:
            print('%.10f %s %s @' % (sl_tl[sl][tl], common.wrap(sl), common.wrap(tl)))
            first = False
        else:
            print('%.10f %s %s' % (sl_tl[sl][tl], common.wrap(sl), common.wrap(tl)))
Example #7
def extract_freq_lexicon(candidates):

    cur_line = 0
    lineno = 0
    sl_tl = {}

    cur_sl_row = []
    cur_tl_row = []
    cur_bt_row = []
    cur_al_row = []

    with open(candidates) as infile:
        for line in infile:
            line = line.strip()
            lineno += 1
            if lineno % 5000 == 0:
                sys.stderr.write('.')
                if lineno % 100000 == 0:
                    sys.stderr.write(str(lineno) + '\n')

                sys.stderr.flush()

            try:
                if line[0] == '-':
                    # Read the corpus, make a note of all ambiguous words, their frequency and their possible translations
                    #
                    # sl_tl[sl_word][tl_word] = tl_freq
                    i = 0
                    for slword in cur_sl_row:
                        if len(cur_bt_row[i]['tls']) > 1:
                            for al in cur_al_row:
                                if al == '':
                                    continue
                                al_sl = int(al.split('-')[1])
                                al_tl = int(al.split('-')[0])
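                                # alignment tokens are 'TL-SL' pairs: '0-1'
                                # links TL token 0 to SL token 1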
                                if al_sl != i:
                                    continue

                                if al_tl < len(cur_tl_row):
                                    tlword = cur_tl_row[al_tl]
                                else:
                                    tlword = cur_tl_row[-1]
                                    traceback.print_stack()
                                    print("alignment out",
                                          "of",
                                          "range",
                                          al_tl,
                                          "not in",
                                          "len(",
                                          cur_tl_row,
                                          ")",
                                          file=sys.stderr)
                                    exit(1)
                                if slword not in sl_tl:
                                    sl_tl[slword] = {}

                                if tlword not in sl_tl[slword]:
                                    sl_tl[slword][tlword] = 0

                                sl_tl[slword][tlword] += 1
                                # print '+' , slword , tlword , sl_tl[slword][tlword], lineno

                        i = i + 1

                    cur_line = 0
                    continue

                line = line.split('\t')[1]

                if cur_line == 0:
                    cur_sl_row = common.tokenise_tagger_line(line)
                elif cur_line == 1:
                    cur_bt_row = common.tokenise_biltrans_line(line)
                elif cur_line == 2:
                    cur_tl_row = common.tokenise_tagger_line(line)
                elif cur_line == 3:
                    cur_al_row = line.split(' ')

                cur_line = cur_line + 1
            except Exception:
                # print("Error in line", lineno, ":", e, file=sys.stderr)
                traceback.print_exc()
                exit(1)

    for sl in sl_tl:

        newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x])
        newtl.reverse()
        first = True
        for tl in newtl:
            if tl[0] == '*':
                print('Error: tl word unknown', tl, file=sys.stderr)
                continue

            first_tag_sl = sl.split('<')[1].split('>')[0].strip()
            first_tag_tl = tl.split('<')[1].split('>')[0].strip()
            if first_tag_sl != first_tag_tl:
                print('Error:',
                      first_tag_sl,
                      '!=',
                      first_tag_tl,
                      file=sys.stderr)
                continue

            if first:
                print(sl_tl[sl][tl], common.wrap(sl), common.wrap(tl), '@')
                first = False
            else:
                print(sl_tl[sl][tl], common.wrap(sl), common.wrap(tl))
Example #8
def ngram_count_patterns(freq_lexicon, candidates, crisphold, max_rules):
    MAX_NGRAMS = 2
    cur_line = 0

    sl_tl_defaults = {}
    sl_tl = {}
    ngrams = {}

    lineno = 0
    for line in open(freq_lexicon).readlines():
        lineno += 1
        if lineno % 10000 == 0:
            print(lineno, file=sys.stderr)
        line = line.strip()  # strip the trailing newline before the emptiness check
        if len(line) < 1:
            continue

        row = common.tokenise_tagger_line(line)
        sl = common.wrap(row[0])
        tl = common.wrap(row[1])
        if tl[1] == '*':
            tl = tl[:-3] + '$'
        if line.count('@') > 0:
            sl_tl_defaults[sl] = tl
        else:
            sl_tl[sl] = tl

    cur_sl_row = []
    cur_tl_row = []
    cur_bt_row = []
    cur_al_row = []
    lineno = 0
    for line in open(candidates).readlines():
        lineno += 1
        line = line.strip()
        if lineno % 500 == 0:
            print(lineno, file=sys.stderr)
        if len(line) < 1:
            continue
        if line[0] == '-':
            #		print len(cur_sl_row), len(cur_tl_row), len(cur_bt_row), len(cur_al_row)
            #		print cur_sl_row
            #		print cur_bt_row
            #		print cur_tl_row
            #		print cur_al_row
            #
            # Read the corpus, make a note of all ambiguous words, their frequency and their possible translations
            #
            # sl_tl[sl_word][tl_word] = tl_freq
            i = 0
            for slword in cur_sl_row:
                if len(cur_bt_row[i]['tls']) > 1:
                    for al in cur_al_row:
                        if al == '':
                            continue
                        al_sl = int(al.split('-')[1])
                        al_tl = int(al.split('-')[0])
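                        # alignment tokens are 'TL-SL' pairs: '0-1' links TL token 0 to SL token 1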
                        if al_sl != i:
                            continue

                        tlword = common.wrap(cur_tl_row[al_tl])
                        slword = common.wrap(slword)

                        if slword not in sl_tl_defaults:
                            print('!', file=sys.stderr)
                            continue

                        for j in range(1, MAX_NGRAMS):
                            pregram = ' '.join(
                                map(common.wrap, cur_sl_row[i - j:i + 1]))
                            postgram = ' '.join(
                                map(common.wrap, cur_sl_row[i:i + j + 1]))
                            roundgram = ' '.join(
                                map(common.wrap, cur_sl_row[i - j:i + j + 1]))

                            if slword not in ngrams:
                                ngrams[slword] = {}

                            if pregram not in ngrams[slword]:
                                ngrams[slword][pregram] = {}

                            if postgram not in ngrams[slword]:
                                ngrams[slword][postgram] = {}

                            if roundgram not in ngrams[slword]:
                                ngrams[slword][roundgram] = {}

                            if tlword not in ngrams[slword][pregram]:
                                ngrams[slword][pregram][tlword] = 0

                            if tlword not in ngrams[slword][postgram]:
                                ngrams[slword][postgram][tlword] = 0

                            if tlword not in ngrams[slword][roundgram]:
                                ngrams[slword][roundgram][tlword] = 0

                            ngrams[slword][pregram][tlword] += 1
                            ngrams[slword][postgram][tlword] += 1
                            ngrams[slword][roundgram][tlword] += 1

                i = i + 1

            cur_line = 0
            # print line
            continue

        line = line.split('\t')[1]

        if cur_line == 0:
            cur_sl_row = common.tokenise_tagger_line(line)
        elif cur_line == 1:
            cur_bt_row = common.tokenise_biltrans_line(line)
        elif cur_line == 2:
            cur_tl_row = common.tokenise_tagger_line(line)
        elif cur_line == 3:
            cur_al_row = line.split(' ')

        cur_line = cur_line + 1

    for sl in ngrams:
        for ngram in ngrams[sl]:
            total = 0
            max_freq = -1
            current_tl = ''
            newtl = sorted(ngrams[sl][ngram],
                           key=lambda x: ngrams[sl][ngram][x])
            newtl.reverse()
            newtl = newtl[:max_rules]
            for tl in newtl:
                if ngrams[sl][ngram][tl] > max_freq:
                    max_freq = ngrams[sl][ngram][tl]
                    current_tl = tl

                total = total + ngrams[sl][ngram][tl]

            # > If for each of the rules we include
            # > the amount of time the translation is seen with that pattern over the
            # > total, we get a number we can try as a threshold. e.g. > 0.6 >0.7 >0.8
            # > etc.  (>0.6 would be the same as 2/3 of the time the alternative
            # > translation is seen with that ngram, and 1/3 of the time the default
            # > translation is). I think this would be easier to explain than the magic
            # > number I came up with.
            #
            # I see this as a way to define how "crispy" the decisions are. I think it
            # would be better to express this as a ratio: the ratio of the times the
            # alternative translation is seen to the number of times the default
            # translation is seen with that n-gram.
            #
            # It would be "2" in this case: the alternative is seen twice as often as
            # the default.
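            #
            # Worked example with made-up counts: if this ngram occurs 6 times
            # with the alternative translation and 3 times with the default
            # (total 9), then alt_crisp = 6/9, def_crisp = 3/9 and
            # crispiness = alt_crisp / def_crisp = 2.0, i.e. the alternative
            # is seen twice as often as the default.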

            for tl in newtl:
                crispiness = 0.0
                default = sl_tl_defaults[sl]
                alt_crisp = float(ngrams[sl][ngram][tl]) / float(total)
                def_crisp = 1.0
                if default in ngrams[sl][ngram]:
                    def_crisp = float(ngrams[sl][ngram][default]) / float(total)

                weight = float(ngrams[sl][ngram][tl]) / float(total)
                crispiness = alt_crisp / def_crisp

                # print '%%%' , crispiness , alt_crisp , def_crisp , tl , default , ngrams[sl][ngram]

                if crispiness < crisphold:
                    print(
                        '-', crispiness, weight, total, max_freq,
                        ngrams[sl][ngram][tl], '\t' + sl + '\t' + ngram +
                        '\t' + tl + '\t' + str(ngrams[sl][ngram][tl]))
                else:
                    print(
                        '+', crispiness, weight, total, max_freq,
                        ngrams[sl][ngram][tl], '\t' + sl + '\t' + ngram +
                        '\t' + tl + '\t' + str(ngrams[sl][ngram][tl]))
Example #9
sl_tl = {}
sl_tl_defaults = {}
indexes = {}
trad_counter = {}

am_counter = 0
dm_counter = 0

# First read in the frequency defaults

for line in open(sys.argv[1]):
    line = line.strip()
    if len(line) < 1:
        continue
    row = common.tokenize_tagger_line(line)
    sl = common.wrap(row[0])
    tl = common.wrap(row[1])
    if tl[1] == '*':
        tl = tl[:-3] + '$'
    if sl not in trad_counter:
        trad_counter[sl] = 0
    if sl not in sl_tl:
        sl_tl[sl] = []
    if line.count('@') > 0:
        sl_tl_defaults[sl] = tl
        sl_tl[sl].append(tl)
        indexes[(sl, tl)] = trad_counter[sl]
        trad_counter[sl] = trad_counter[sl] + 1
    else:
Example #10
                    sl_tl[sl] = {}
                if tl not in sl_tl[sl]:
                    sl_tl[sl][tl] = 0.0
                sl_tl[sl][tl] = sl_tl[sl][tl] + frac_count

        dm_line = dm_file.readline()
        if dm_line == '':
            break
        current_dm_line_id = int(dm_line.split('.[][')[1].split(' ')[0])


for sl in sl_tl:
    newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x])
    newtl.reverse()
    first = True
    for tl in newtl:
        if first:
            print('%.10f %s %s @' % (sl_tl[sl][tl], common.wrap(sl), common.wrap(tl)))
            first = False
        else:
            print('%.10f %s %s' % (sl_tl[sl][tl], common.wrap(sl), common.wrap(tl)))
Example #11
    def write_envelopes(self):
        stt = 50  # seconds-to-ticks converter
        # volume envelope
        volume_points = 0
        volume_ticks = 0
        volume_envelope = []
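        # volume_envelope collects interleaved (tick, level) pairs; the XM
        # instrument header has room for 12 envelope points, so the list is
        # zero-padded to 12 points when it is packed below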
        if 'ampeg_attack' not in self.options:
            volume_level = 0x40
        else:
            volume_level = 0
        vol_sustain_point = 0

        volume_envelope.append(volume_ticks)
        if 'ampeg_delay' in self.options:
            volume_ticks += float(self.options['ampeg_delay']) * stt
            volume_points += 1
            volume_level = 0

            volume_envelope.append(volume_level)
            volume_envelope.append(volume_ticks)

        if 'ampeg_start' in self.options:
            volume_level = int(float(self.options['ampeg_start']) / 100 * stt)

        if 'ampeg_attack' in self.options:
            volume_ticks += int(float(self.options['ampeg_attack']) * stt)

        volume_envelope.append(volume_level)
        volume_points += 1

        if 'ampeg_hold' in self.options:
            volume_ticks += int(float(self.options['ampeg_hold']) * stt)
        else:
            volume_level = 0x40
        volume_envelope.append(volume_ticks)
        volume_envelope.append(volume_level)
        volume_points += 1

        if 'ampeg_decay' in self.options:
            volume_ticks += int(float(self.options['ampeg_decay']) * stt)
            volume_envelope.append(volume_ticks)

            if 'ampeg_sustain' in self.options:
                volume_envelope.append(int(float(self.options['ampeg_sustain']) / 100 * stt))
            else:
                volume_envelope.append(0)

            volume_points += 1

        if 'ampeg_sustain' in self.options:
            volume_level = int(float(self.options['ampeg_sustain']) / 100 * stt)
            volume_envelope.append(volume_ticks)
            volume_envelope.append(volume_level)
            volume_points += 1
            vol_sustain_point = volume_points - 1

        if 'ampeg_release' in self.options:
            volume_ticks += int(float(self.options['ampeg_release']) * stt)
            volume_level = 0x0
            volume_envelope.append(volume_ticks)
            volume_envelope.append(volume_level)
            volume_points += 1

        if volume_ticks > 512:
            for i in range(len(volume_envelope) // 2):
                volume_envelope[2 * i] = int(volume_envelope[2 * i] * 512 / volume_ticks)
            if OPTIONS['verbose']:
                wrap("/"*10 + " Envelope too long: " + str(volume_ticks) + " ticks, shrunk to 512")

        self.output_file.write(struct.pack('{0}h'.format(2 * volume_points), *(volume_envelope)))
        self.output_file.write(struct.pack('{0}h'.format(2 * (12 - volume_points)),
                               *(0 for i in range(2 * (12 - volume_points)))))

        self.output_file.write(struct.pack('24h', *(0 for i in range(24))))  # panning envelope

        self.output_file.write(struct.pack('b', volume_points))
        self.output_file.write(struct.pack('b', 0))  # number of panning points

        self.output_file.write(struct.pack('b', vol_sustain_point))

        # volume/panning loop points and panning sustain point (unused)
        self.output_file.write(struct.pack('5b', *(0 for i in range(5))))

        volume_type = 0
        if volume_points > 0:
            volume_type += 0b1
        if vol_sustain_point > 0:
            volume_type += 0b10

        self.output_file.write(struct.pack('b', volume_type))
        self.output_file.write(struct.pack('b', 0))  # panning envelope type

        # vibrato type/sweep/depth/rate
        self.output_file.write(struct.pack('4b', *(0 for i in range(4))))

        self.output_file.write(struct.pack('h', 0))  # volume fadeout
        self.output_file.write(struct.pack('22b', *(0 for i in range(22))))  # reserved extended data
Example #12
sl_tl = {}
sl_tl_defaults = {}
indexes = {}
trad_counter = {}

am_counter = 0
dm_counter = 0


# First read in the frequency defaults

for line in open(sys.argv[1]):
    line = line.strip()
    if len(line) < 1:
        continue
    row = common.tokenize_tagger_line(line)
    sl = common.wrap(row[0])
    tl = common.wrap(row[1])
    if tl[1] == "*":
        tl = tl[:-3] + "$"
    if sl not in trad_counter:
        trad_counter[sl] = 0
    if sl not in sl_tl:
        sl_tl[sl] = []
    if line.count("@") > 0:
        sl_tl_defaults[sl] = tl
        sl_tl[sl].append(tl)
        indexes[(sl, tl)] = trad_counter[sl]
        trad_counter[sl] = trad_counter[sl] + 1
    else: