def dict_score_prons(dict_, candidate_prons):
    # adds "score" information to candidate pronunciations. If a candidate
    # pronunciation matches a dictionary entry, it's "right." If it matches
    # except for stress, it's "partial." Otherwise, it's "wrong."
    #
    # wsj/s5/local/dict/score_prons.pl
    word_and_pron = set()
    word_and_pron_nostress = set()
    num_pattern = re.compile(r"\d")
    for entry in cat(dict_):
        word, pron = entry.split(maxsplit=1)
        pron_nostress = num_pattern.sub("", pron)
        word_and_pron.add(word + ";" + pron)
        word_and_pron_nostress.add(word + ";" + pron_nostress)
    if isinstance(candidate_prons, str):
        candidate_prons = cat(candidate_prons)
    for line in candidate_prons:
        word, pron = line.split(";", 2)[:2]
        pron_nostress = num_pattern.sub("", pron)
        if (word + ";" + pron) in word_and_pron:
            score = ";right"
        elif (word + ";" + pron_nostress) in word_and_pron_nostress:
            score = ";partial"
        else:
            score = ";wrong"
        yield line + score

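# Minimal usage sketch for dict_score_prons (hypothetical toy data, not from the
# original source). It assumes cat() passes an in-memory iterable of lines
# through unchanged, which is how the calls above appear to use it; candidate
# lines only need the leading "word;pron" fields for scoring.
def _demo_dict_score_prons():
    dict_lines = ["HELLO HH AH0 L OW1"]
    candidates = [
        "HELLO;HH AH0 L OW1",  # exact match            -> ";right"
        "HELLO;HH AH2 L OW1",  # matches except stress  -> ";partial"
        "HELLO;HH EH0 L OW1",  # different phone        -> ";wrong"
    ]
    return list(dict_score_prons(dict_lines, candidates))
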
def __init__(self, key: str, img: List[str]):
    self.key = key
    self.img = img
    self.stored_ops: List[int] = []

    # Store edges as ids to speed up later matches. Convert to binary, read clockwise
    def edge_to_int(s: str) -> int:
        return int(str_replace(s, ['.', '#'], ['1', '0']), 2)

    top, bottom = img[0], img[-1][::-1]
    right, left = cat([l[-1] for l in img]), cat([l[0] for l in img])[::-1]
    self.edges = [edge_to_int(side) for side in [left, top, right, bottom]]

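# Worked example of the clockwise edge encoding above, as a standalone helper
# (illustration only; it re-implements edge_to_int with plain str.replace so it
# does not depend on the str_replace/cat helpers).
def _demo_edge_ints(img):
    def edge_to_int(s):
        return int(s.replace('.', '1').replace('#', '0'), 2)
    top, bottom = img[0], img[-1][::-1]
    right = ''.join(l[-1] for l in img)
    left = ''.join(l[0] for l in img)[::-1]
    return [edge_to_int(side) for side in (left, top, right, bottom)]

# _demo_edge_ints(["#..", ".#.", "..#"]) == [6, 3, 6, 3]
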
def solve_part_two(day_input: List[str]) -> int:
    def mask_addresses(mask: str) -> Iterator[int]:
        """Returns all the possible addresses by substituting X by 0 and 1"""
        if mask.find('X') == -1:
            yield int(mask, 2)
        else:
            yield from mask_addresses(mask.replace('X', '0', 1))
            yield from mask_addresses(mask.replace('X', '1', 1))

    mem = dict()
    for line in day_input:
        op, arg = line.split(' = ')
        if op == 'mask':
            # Safeguard, don't accept more than 2^10 addresses to write
            if arg.count('X') > 10:
                raise ValueError(
                    f"An input with more than 2^10 possibilities was encountered: \n{line}")
            mask = arg
        elif op.startswith('mem'):
            # Strategy is to:
            # 1. create the address mask doing a manual *or*
            # 2. get all possible addresses and set them
            addr = f"{int(op[4:-1]):036b}"
            new_mask = [a if m == '0' else m for m, a in zip(mask, addr)]
            for m in mask_addresses(cat(new_mask)):
                mem[m] = int(arg)
    return sum(mem.values())

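# Small standalone sketch of the floating-address expansion used above (the
# inner mask_addresses is not reachable from outside, so this re-implements the
# same recursion for illustration).
def _demo_mask_addresses(mask):
    if 'X' not in mask:
        yield int(mask, 2)
    else:
        yield from _demo_mask_addresses(mask.replace('X', '0', 1))
        yield from _demo_mask_addresses(mask.replace('X', '1', 1))

# sorted(_demo_mask_addresses('X1X')) == [2, 3, 6, 7]
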
def dict_limit_candidate_prons(rule_hierarchy, candidate_prons):
    # This takes the candidate pronunciations from dict_get_candidate_prons
    # and the rule hierarchy from dict_get_rule_hierarchy and limits the
    # candidate pronunciations to those that use the most specific rules
    #
    # see wsj/s5/local/dict/limit_candidate_prons.pl etc etc...
    hierarchy = set(cat(rule_hierarchy))

    def process_word(cur_lines):
        pair2rule_list = dict()
        for line in cur_lines:
            baseword, basepron = line.split(";", 4)[2:4]
            key = baseword + ";" + basepron
            pair2rule_list.setdefault(key, []).append(line)
        for lines in pair2rule_list.values():
            stress, rules = [], []
            for line in lines:
                rulename, destress = line.split(";")[4:6]
                stress.append(destress)
                rules.append(rulename)
            for m in range(len(lines)):
                ok = True
                for n in range(len(lines)):
                    if m == n or stress[m] != stress[n]:
                        continue
                    if (rules[n] + ";" + rules[m]) in hierarchy:
                        ok = False
                        break
                if ok:
                    yield lines[m]

    if isinstance(candidate_prons, str):
        candidate_prons = cat(candidate_prons)
    cur_word = None
    cur_lines = []
    for line in candidate_prons:
        word = line.split(";", 1)[0]
        if cur_word is not None and cur_word != word:
            for x in process_word(cur_lines):
                yield x
            cur_lines.clear()
        cur_lines.append(line)
        cur_word = word
    for x in process_word(cur_lines):
        yield x

def valid(name, checksum):
    "Determine if name is valid according to checksum."
    counts = Counter(name.replace('-', ''))
    print(counts)
    # Note: counts.most_common(5) doesn't work because it breaks ties arbitrarily.
    letters = sorted(counts, key=lambda L: (-counts[L], L))
    print(letters)
    return checksum == cat(letters[:5])

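# Hypothetical check reproducing the example from the AoC 2016 day 4 statement:
# "aaaaa-bbb-z-y-x" has counts a:5, b:3, x:1, y:1, z:1, so the five most common
# letters with alphabetical tie-breaking are "abxyz". Assumes cat behaves like
# ''.join here, as the return statement above suggests.
def _demo_valid():
    return valid("aaaaa-bbb-z-y-x", "abxyz")  # expected: True
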
def img_after_ops(img: List[str], ops: List[int]) -> List[str]:
    """Apply rotation and flip *ops* to image *img* returning the result"""
    new_img = img[:]
    for op in ops:
        if op == Tile.ROTATE:
            new_img = [cat(l)[::-1] for l in zip(*new_img)]
        elif op == Tile.FLIP:
            new_img = [l[::-1] for l in new_img]
    return new_img

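# Small illustration of the two ops (hypothetical, assuming Tile.ROTATE and
# Tile.FLIP are the op identifiers stored by the Tile class above and that cat
# behaves like ''.join): one ROTATE turns the image clockwise, one FLIP mirrors
# each row.
def _demo_img_after_ops():
    rotated = img_after_ops(["ab", "cd"], [Tile.ROTATE])  # ["ca", "db"]
    flipped = img_after_ops(["ab", "cd"], [Tile.FLIP])    # ["ba", "dc"]
    return rotated, flipped
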
def wsj_prepare_char_dict(data_root, src_dict_suffix, dst_dict_suffix):
    phone_dir = os.path.join(data_root, "local", "dict" + src_dict_suffix)
    dir_ = os.path.join(data_root, "local", "dict" + dst_dict_suffix)
    mkdir(dir_)
    lexicon1_raw_nosil_txt = os.path.join(phone_dir, "lexicon1_raw_nosil.txt")
    phn_lexicon2_raw_nosil_txt = os.path.join(phone_dir, "lexicon2_raw_nosil.txt")
    unique = OrderedDict()
    for entry in cat(lexicon1_raw_nosil_txt):
        unique.setdefault(entry.split(" ")[0], entry)
    pipe_to(unique.values(), phn_lexicon2_raw_nosil_txt)
    char_lexicon2_raw_nosil_txt = os.path.join(dir_, "lexicon2_raw_nosil.txt")
    bad_chars = set('!~@#$%^&*()+=/",;:?_{}-')
    pipe_to(
        (" ".join([x] + [y for y in x if y not in bad_chars]) for x in unique.keys()),
        char_lexicon2_raw_nosil_txt,
    )
    del unique
    pipe_to(["SIL", "SPN", "NSN"], os.path.join(dir_, "silence_phones.txt"))
    pipe_to(["SIL"], os.path.join(dir_, "optional_silence.txt"))
    pipe_to(
        sort(set(cat(
            ["!SIL SIL", "<SPOKEN_NOISE> SPN", "<NOISE> NSN"],
            char_lexicon2_raw_nosil_txt,
        ))),
        os.path.join(dir_, "lexicon.txt"),
    )
    pipe_to(
        sort(set(cat(*(x.split(" ")[1:] for x in cat(char_lexicon2_raw_nosil_txt))))),
        os.path.join(dir_, "nonsilence_phones.txt"),
    )

def display(board: List[Position]):
    """Displays a board"""
    boundaries = get_boundaries(board)
    for w in range(*boundaries['w']):
        for z in range(*boundaries['z']):
            print(f'z={z}, w={w}')
            for y in range(*boundaries['y']):
                line = ['.'] * (boundaries['x'][1] - boundaries['x'][0])
                for p in board:
                    # keep only the points lying in the current (z, w) slice
                    if p.z == z and p.w == w and p.y == y:
                        line[p.x - boundaries['x'][0]] = '#'
                print(cat(line))

def wsj_train_lms(data_root, src_dict_suffix, out_dir="local_lm", max_order=4):
    # Train a language model on WSJ lm training corpus
    # Here we don't do things the Kaldi way. Kaldi uses its own
    # derivative-based language modeling. We'll do modified Kneser-Ney
    # smoothing, which is a little more widespread.
    src_dict_dir = os.path.join(data_root, "local",
                                "dict" + src_dict_suffix + "_larger")
    dst_lm_dir = os.path.join(data_root, "local", out_dir)
    mkdir(dst_lm_dir)
    vocab = set(x.split()[0] for x in cat(os.path.join(src_dict_dir, "lexicon.txt")))
    vocab.remove("!SIL")
    pipe_to(sorted(vocab), os.path.join(dst_lm_dir, "wordlist.txt"))
    with gzip.open(os.path.join(src_dict_dir, "cleaned.gz"), "rt") as f:
        text = f.read()
    sents = ngram_lm.text_to_sents(text, sent_end_expr=r"\n", word_delim_expr=r" +")
    del text
    ngram_counts = ngram_lm.sents_to_ngram_counts(sents, max_order)
    ngram_counts[0]["<UNK>"] = 0  # add to vocab
    # find any ngrams that contain words that aren't part of the vocabulary.
    # we'll prune them. By pruning them we mean completely removing them.
    # Modified Kneser-Ney can use the frequency statistics before removing
    # them
    to_prune = set(ngram_counts[0]) - vocab
    to_prune.remove("<S>")
    for i, ngram_count in enumerate(ngram_counts[1:]):
        if i:
            to_prune.update(
                x for x in ngram_count if x[:-1] in to_prune or x[-1] in to_prune)
        else:
            to_prune.update(
                x for x in ngram_count if x[0] in to_prune or x[-1] in to_prune)
    prob_list = ngram_lm.ngram_counts_to_prob_list_kneser_ney(
        ngram_counts, sos="<S>", to_prune=to_prune)
    del ngram_counts
    lm = ngram_lm.BackoffNGramLM(prob_list, sos="<S>", eos="</S>", unk="<UNK>")
    # "pruning" here means removing the probability mass of the sos token and
    # redistributing to the other unigrams. "<S>" will remain in the
    # vocabulary
    lm.prune_by_name({"<S>"})
    del prob_list
    print("Corpus PPL:", lm.corpus_perplexity(sents))

def solve_part_one(day_input: List[str]) -> str:
    lst = [int(i) for i in day_input[0]]
    M, m = max(lst), min(lst)
    for _ in range(100):
        val = lst[0] - 1 if lst[0] > m else M
        while val in lst[1:4]:
            val = val - 1 if val > m else M
        idx = lst.index(val)
        lst = lst[4:idx + 1] + lst[1:4] + lst[idx + 1:] + lst[0:1]
    idx = lst.index(1)
    return cat(str(i) for i in lst[idx + 1:] + lst[0:idx])

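# The worked example from the AoC 2020 day 23 statement: starting from the cup
# labelling "389125467", the labels after cup 1 following 100 moves are
# "67384529" (sketch; assumes cat behaves like ''.join here).
def _demo_solve_part_one():
    return solve_part_one(["389125467"])  # expected: "67384529"
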
def dict_score_rules(counted_rules, partial_score=0.8, ballast=1,
                     destress_penalty=1e-5):
    # weigh the counted rules to derive a score for each rule
    #
    # wsj/s5/local/dict/score_rules.pl
    if isinstance(counted_rules, str):
        counted_rules = cat(counted_rules)
    for counted_rule in counted_rules:
        rule, destress, right, partial, wrong = counted_rule.split(";")
        rule_score = int(right) + int(partial) * partial_score
        rule_score /= int(right) + int(partial) + int(wrong) + ballast
        if destress == "yes":
            rule_score -= destress_penalty
        yield "{};{};{:.5f}".format(rule, destress, rule_score)

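# Worked example of the scoring formula above (toy rule name, in-memory list so
# cat is not needed): a rule counted as 8 right, 1 partial, 1 wrong scores
# (8 + 1 * 0.8) / (8 + 1 + 1 + 1) = 0.8 with the default ballast of 1.
def _demo_dict_score_rules():
    return list(dict_score_rules(["SUFF;no;8;1;1"]))  # ['SUFF;no;0.80000']
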
def dict_select_candidate_prons(candidates, max_prons=4, min_rule_score=0.35):
    # for a given word, sort its pronunciations by score and return up to
    # max_prons of the top-scored candidates, subject to the constraint
    # that all returned candidates have a score >= min_rule_score
    #
    # wsj/s5/local/dict/select_candidates.pl
    #
    # AFAICT Perl sort-keys-by-value introduces non-determinism in the order
    # of pronunciations of the same score. Unfortunately, this determines the
    # ultimate candidate pronunciations of some OOV words.
    # We pre-empt a bit of that non-determinism, but don't expect perfect
    # matching values with Kaldi.

    def process_word(cur_lines):
        pron2rule_score = dict()
        pron2line = dict()
        for line in cur_lines:
            word, pron, _, _, _, _, score = line.split(";")
            score = float(score)
            if score >= min_rule_score and score > pron2rule_score.get(pron, -1):
                pron2rule_score[pron] = score
                pron2line[pron] = line
        prons = sorted(pron2rule_score, key=lambda x: (-pron2rule_score[x], x))
        for pron, _ in zip(prons, range(max_prons)):
            yield pron2line[pron]

    cur_lines = []
    cur_word = None
    if isinstance(candidates, str):
        candidates = cat(candidates)
    for candidate in candidates:
        word, _ = candidate.split(";", 1)
        if word != cur_word:
            for line in process_word(cur_lines):
                yield line
            cur_word, cur_lines = word, []
        cur_lines.append(candidate)
    for line in process_word(cur_lines):
        yield line

def dict_reverse_candidates(candidates):
    # reverse prefix/suffixes in candidate pronunciation list
    #
    # wsj/s5/local/dict/reverse_candidates.pl
    def reverse_str(x):
        return x[::-1]

    def reverse_pron(x):
        return " ".join(x.split(" ")[::-1])

    if isinstance(candidates, str):
        candidates = cat(candidates)
    for candidate in candidates:
        word, pron, baseword, basepron, rule, rest = candidate.split(";", 5)
        word, pron = reverse_str(word), reverse_pron(pron)
        baseword, basepron = reverse_str(baseword), reverse_pron(basepron)
        r_suff, r_bsuff, r_pron, r_bpron = rule.split(",")
        r_suff, r_bsuff = reverse_str(r_suff), reverse_str(r_bsuff)
        r_pron, r_bpron = reverse_pron(r_pron), reverse_pron(r_bpron)
        rule = ",".join((r_suff, r_bsuff, r_pron, r_bpron))
        yield ";".join((word, pron, baseword, basepron, rule, rest))

def dict_count_rules(scored_prons):
    # count the number of times a rule, stress pair was scored right, partial
    # or wrong in dict_score_prons
    #
    # wsj/s5/local/dict/count_rules.pl
    counts = dict()
    if isinstance(scored_prons, str):
        scored_prons = cat(scored_prons)
    for scored_pron in scored_prons:
        rulename, destress, score = scored_pron.split(";")[4:]
        ref = counts.setdefault(rulename + ";" + destress, [0, 0, 0])
        if score == "right":
            ref[0] += 1
        elif score == "partial":
            ref[1] += 1
        elif score == "wrong":
            ref[2] += 1
        else:
            raise ValueError("Bad score")
    for key, value in counts.items():
        yield ";".join([key] + [str(x) for x in value])

def dict_get_rule_hierarchy(rules_path):
    # this function looks at the pairs of suffixes from dict_get_rules.
    # Whenever those spelling/pronunciation suffixes share a non-empty prefix,
    # it implies that that rule could be generalized to one that excludes that
    # shared prefix. For (a real) example: ("TICS", ["S"]) and ("TIC", [])
    # are a paired rule. Recall that this says you can generate a new word by
    # removing "TICS" from the word and adding "TIC", and removing ["S"] from
    # its pronunciation and adding [] (or vice versa). Because the suffixes
    # share a prefix "TIC", a more general rule might exclude the "TIC" part:
    # ("S", ["S"]), ("", []). If this more general rule is also included in the
    # set of rules, then the output stores the specific -> general relationship
    #
    # See wsj/s5/local/dict/get_rule_hierarchy.pl for a more formal description
    rules = set(cat(rules_path))
    for rule in rules:
        A = rule.split(",")
        suffix_a, suffix_b = tuple(A[0]), tuple(A[1])
        psuffix_a, psuffix_b = A[2].split(), A[3].split()
        common_suffix_len = 0
        while common_suffix_len < min(len(suffix_a), len(suffix_b)):
            if suffix_a[common_suffix_len] != suffix_b[common_suffix_len]:
                break
            common_suffix_len += 1
        common_psuffix_len = 0
        while common_psuffix_len < min(len(psuffix_a), len(psuffix_b)):
            if psuffix_a[common_psuffix_len] != psuffix_b[common_psuffix_len]:
                break
            common_psuffix_len += 1
        for m in range(common_suffix_len + 1):
            sa, sb = "".join(suffix_a[m:]), "".join(suffix_b[m:])
            for n in range(common_psuffix_len + 1):
                if not n and not m:
                    continue
                psa, psb = " ".join(psuffix_a[n:]), " ".join(psuffix_b[n:])
                more_general_rule = ",".join((sa, sb, psa, psb))
                if more_general_rule in rules:
                    yield ";".join((rule, more_general_rule))

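# Example of the specific -> general relationship described above (toy data;
# assumes cat() passes an in-memory list of rule lines through unchanged).
# Given the rules "TICS,TIC,S," and "S,,S,", the second is a generalization of
# the first obtained by dropping the shared "TIC" prefix, so the generator
# should yield the single pair "TICS,TIC,S,;S,,S,".
def _demo_rule_hierarchy():
    return list(dict_get_rule_hierarchy(["TICS,TIC,S,", "S,,S,"]))
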
    elif cmd.startswith('rotate row'):
        screen[A, :] = rotate(screen[A, :], B)
    elif cmd.startswith('rotate col'):
        screen[:, A] = rotate(screen[:, A], B)

def rotate(items, n):
    return np.append(items[-n:], items[:-n])

def Screen():
    return np.zeros((6, 50), dtype=int)

def run(commands, screen):
    for cmd in commands:
        interpret(cmd, screen)
    return screen

screen = run(Input(8), Screen())
print(np.sum(screen))
for row in screen:
    print(cat(' @' if pixel == 1 else ' ' for pixel in row))

from common import transpose, first, groupby, cat
import unittest

if __name__ == '__main__':
    assert tuple(transpose(((1, 2, 3), (4, 5, 6)))) == ((1, 4), (2, 5), (3, 6))
    assert first('abc') == first(['a', 'b', 'c']) == 'a'
    assert cat(['a', 'b', 'c']) == 'abc'
    assert (groupby(['test', 'one', 'two', 'three', 'four'], key=len) == {
        3: ['one', 'two'],
        4: ['test', 'four'],
        5: ['three']
    })
    unittest.main()

def parallel_exonerate(ex, subfiles, dbFile, grp_descr, minScore=77, n=1, x=22, l=30,
                       q=True, via="local"):
    futures = [fastqToFasta.nonblocking(ex, sf, n=n, x=x, via=via) for sf in subfiles]
    futures2 = []
    res = []
    resExonerate = []
    faSubFiles = []
    all_ambiguous = []
    all_ambiguous_fastq = []
    all_unaligned = []
    all_discarded = []
    gid, grp_name = grp_descr
    my_minscore = _get_minscore(dbFile)
    for sf in futures:
        subResFile = unique_filename_in()
        faSubFiles.append(sf.wait())
        futures2.append(exonerate.nonblocking(ex, faSubFiles[-1], dbFile,
                                              minScore=my_minscore, via=via,
                                              stdout=subResFile, memory=6))
        resExonerate.append(subResFile)
    for nf, f in enumerate(resExonerate):
        futures2[nf].wait()
        (resSplitExonerate, alignments) = split_exonerate(f, minScore, l=l, n=n)
        all_unaligned.append(alignments["unaligned"])
        all_ambiguous.append(alignments["ambiguous"])
        all_ambiguous_fastq.append(alignments["ambiguous_fastq"])
        all_discarded.append(alignments["discarded"])
        res.append(resSplitExonerate)
    gzipfile(ex, cat(all_unaligned[1:], out=all_unaligned[0]))
    ex.add(all_unaligned[0] + ".gz",
           description=set_file_descr(grp_name + "_unaligned.txt.gz",
                                      groupId=gid, step="exonerate", type="txt",
                                      view="admin",
                                      comment="scores between %i and %i" % (my_minscore, minScore)))
    # add ambiguous file only if it is not empty
    n = count_lines(ex, all_ambiguous[0])
    if n > 1:
        gzipfile(ex, cat(all_ambiguous[1:], out=all_ambiguous[0]))
        ex.add(all_ambiguous[0] + ".gz",
               description=set_file_descr(grp_name + "_ambiguous.txt.gz",
                                          groupId=gid, step="exonerate", type="txt",
                                          view="admin",
                                          comment="multiple equally good classifications"))
    # add ambiguous fastq file only if it is not empty
    tot_ambiguous = count_lines(ex, all_ambiguous_fastq[0]) / 4
    if tot_ambiguous > 1:
        gzipfile(ex, cat(all_ambiguous_fastq[1:], out=all_ambiguous_fastq[0]))
        ex.add(all_ambiguous_fastq[0] + ".gz",
               description=set_file_descr(grp_name + "_ambiguous.fastq.gz",
                                          groupId=gid, step="exonerate", type="fastq",
                                          comment="multiple equally good classifications"))
    # add discarded file only if it is not empty
    tot_discarded = count_lines(ex, all_discarded[0]) / 4
    if tot_discarded > 1:
        gzipfile(ex, cat(all_discarded[1:], out=all_discarded[0]))
        ex.add(all_discarded[0] + ".gz",
               description=set_file_descr(grp_name + "_discarded.fastq.gz",
                                          groupId=gid, step="exonerate", type="fastq",
                                          view="admin", comment="< %i bps" % l))
    gzipfile(ex, faSubFiles[0])
    ex.add(faSubFiles[0] + ".gz",
           description=set_file_descr(grp_name + "_input_part.fa.gz",
                                      groupId=gid, step="init", type="fa",
                                      view="admin", comment="part"))
    gzipfile(ex, resExonerate[0])
    ex.add(resExonerate[0] + ".gz",
           description=set_file_descr(grp_name + "_exonerate_part.txt.gz",
                                      groupId=gid, step="exonerate", type="txt",
                                      view="admin", comment="part"))
    resFiles = dict((k, '') for d in res for k in d.keys())
    for k in resFiles.keys():
        v = [d[k] for d in res if k in d]
        resFiles[k] = cat(v[1:], out=v[0])
    return (resFiles, tot_ambiguous, tot_discarded)

def demultiplex_workflow(ex, job, gl, file_path="../", via='lsf',
                         logfile=sys.stdout, debugfile=sys.stderr):
    script_path = gl['script_path']
    file_names = {}
    job_groups = job.groups
    resFiles = {}
    for gid, group in job_groups.iteritems():
        file_names[gid] = {}
        primersFilename = 'group_' + group['name'] + "_barcode_file.fa"
        primersFile = group.get("primersfile", os.path.join(file_path, primersFilename))
        ex.add(primersFile,
               description=set_file_descr(primersFilename, groupId=gid,
                                          step="init", type="fa"))
        paramsFilename = 'group_' + group['name'] + "_param_file.txt"
        paramsFile = group.get("paramsfile", os.path.join(file_path, paramsFilename))
        ex.add(paramsFile,
               description=set_file_descr(paramsFilename, groupId=gid,
                                          step="init", type="txt"))
        params = load_paramsFile(paramsFile)

        infiles = []
        tot_counts = 0
        allSubFiles = []
        for rid, run in group['runs'].iteritems():
            infiles.append(run)
            n = count_lines(ex, run)
            tot_counts += n / 4
            if n > 10000000:
                allSubFiles.extend(split_file(ex, run, n_lines=8000000))
            else:
                allSubFiles.append(run)
        (resExonerate, tot_ambiguous, tot_discarded) = parallel_exonerate(
            ex, allSubFiles, primersFile, (gid, group['name']), via=via, **params)

        gzipfile(ex, cat(infiles))
        ex.add(run + ".gz",
               description=set_file_descr(group['name'] + "_full_fastq.gz",
                                          groupId=gid, step='exonerate',
                                          view='debug', type="fastq"))

        logfile.write("Will get sequences to filter\n")
        logfile.flush()
        seqToFilter = getSeqToFilter(ex, primersFile)
        logfile.write("Will filter the sequences\n")
        filteredFastq = filterSeq(ex, resExonerate, seqToFilter, gid,
                                  group['name'], via=via)
        logfile.write("After filterSeq, filteredFastq=%s\n" % filteredFastq)
        logfile.flush()

        counts_primers = {}
        counts_primers_filtered = {}
        global bcDelimiter
        if len(filteredFastq):
            archive = unique_filename_in()
            tgz = tarfile.open(archive, "w:gz")
        for k, f in resExonerate.iteritems():
            counts_primers[k] = count_lines(ex, f) / 4
            if k in filteredFastq:
                k2 = k.replace(bcDelimiter, "_")
                file_names[gid][k2] = group['name'] + "_" + k2 + "_filtered"
                ex.add(filteredFastq[k],
                       description=set_file_descr(file_names[gid][k2] + ".fastq",
                                                  groupId=gid, step="final",
                                                  type="fastq"))
                counts_primers_filtered[k] = count_lines(ex, filteredFastq[k]) / 4
                tgz.add(f, arcname=group['name'] + "_" +
                        k.replace(bcDelimiter, "_") + ".fastq")
            else:
                k2 = k.replace(bcDelimiter, "_")
                file_names[gid][k2] = group['name'] + "_" + k2
                ex.add(f,
                       description=set_file_descr(file_names[gid][k2] + ".fastq",
                                                  groupId=gid, step="final",
                                                  type="fastq"))
                counts_primers_filtered[k] = 0
        if len(filteredFastq):
            tgz.close()
            ex.add(archive,
                   description=set_file_descr(group['name'] + "_unfiltered_fastq.tgz",
                                              groupId=gid, step="exonerate",
                                              type="tar"))

        # Prepare report per group of runs
        report_ok, reportFile = prepareReport(ex, group['name'], tot_counts,
                                              counts_primers, counts_primers_filtered,
                                              tot_ambiguous, tot_discarded)
        ex.add(reportFile,
               description=set_file_descr(group['name'] + "_report_demultiplexing.txt",
                                          groupId=gid, step="final", type="txt",
                                          view="admin"))
        if report_ok:
            reportFile_pdf = unique_filename_in()
            createReport(ex, reportFile, reportFile_pdf, script_path)
            ex.add(reportFile_pdf,
                   description=set_file_descr(group['name'] + "_report_demultiplexing.pdf",
                                              groupId=gid, step="final", type="pdf"))
        else:
            logfile.write("*** Probable ambiguous classification: "
                          "total_reads < sum(reads_by_primers) ***\n")
            logfile.flush()

    add_pickle(ex, file_names,
               set_file_descr('file_names', step="final", type='py', view='admin'))
    return resFiles

def wsj_prepare_dict(data_root, dict_suffix=""):
    dir_ = os.path.join(data_root, "local", "dict" + dict_suffix)
    cmudict = os.path.join(dir_, "cmudict")
    mkdir(cmudict)
    # we use the github URL mentioned here
    # http://www.speech.cs.cmu.edu/cgi-bin/cmudict
    # to avoid using subversion.
    url = ("https://raw.githubusercontent.com/Alexir/CMUdict/"
           "7a37de79f7e650fd6334333b1b5d2bcf0dee8ad3/")
    for x in {"cmudict.0.7a", "cmudict-0.7b.symbols"}:
        # 0.7b.symbols the same as 0.7a.symbols
        path = os.path.join(cmudict, x)
        if not os.path.exists(path):
            request.urlretrieve(url + x, path)
    silence_phones_txt = os.path.join(dir_, "silence_phones.txt")
    optional_silence_txt = os.path.join(dir_, "optional_silence.txt")
    pipe_to(["SIL", "SPN", "NSN"], silence_phones_txt)
    pipe_to(["SIL"], optional_silence_txt)
    nonsilence_phones_txt = os.path.join(dir_, "nonsilence_phones.txt")
    phone_pattern = re.compile(r"^(\D+)\d*$")
    phones_of = dict()
    for phone in cat(os.path.join(cmudict, "cmudict-0.7b.symbols")):
        match = phone_pattern.match(phone)
        if not match:
            raise ValueError("Bad phone {}".format(phone))
        base = match.group(1)  # no stress
        phones_of.setdefault(base, []).append(phone)
    pipe_to((" ".join(x) for x in phones_of.values()), nonsilence_phones_txt)
    del phones_of
    # skip extra_questions.txt
    # there were a few updates to 0.7.a that make the resulting lexicon
    # slightly different from Kaldi's
    lexicon_raw_nosil_txt = os.path.join(dir_, "lexicon1_raw_nosil.txt")
    entry_pattern = re.compile(r"^(\S+)\(\d+\) (.*)$")
    lexicon_raw_nosil_lines = []
    for line in cat(os.path.join(cmudict, "cmudict.0.7a")):
        if line.startswith(";;;"):
            continue
        match = entry_pattern.match(line)
        if match is None:
            lexicon_raw_nosil_lines.append(line)
        else:
            lexicon_raw_nosil_lines.append(
                " ".join([match.group(1), match.group(2)]))
    pipe_to(lexicon_raw_nosil_lines, lexicon_raw_nosil_txt)
    del lexicon_raw_nosil_lines
    lexicon_txt = os.path.join(dir_, "lexicon.txt")
    pipe_to(
        sort(set(cat(
            ["!SIL SIL", "<SPOKEN_NOISE> SPN", "<UNK> SPN", "<NOISE> NSN"],
            lexicon_raw_nosil_txt,
        ))),
        lexicon_txt,
    )

def wsj_extend_dict(dir_13_32_1, data_root, src_dict_suffix, mincount=2):
    src_dict_dir = os.path.join(data_root, "local", "dict" + src_dict_suffix)
    dst_dict_dir = os.path.join(data_root, "local",
                                "dict" + src_dict_suffix + "_larger")
    if os.path.isdir(dst_dict_dir):
        rmtree(dst_dict_dir)
    copytree(src_dict_dir, dst_dict_dir)

    # lexicon1_raw_nosil.txt is an unsorted (?) version of dict.cmu
    dict_cmu = os.path.join(dst_dict_dir, "dict.cmu")
    pipe_to(
        sort(set(cat(os.path.join(src_dict_dir, "lexicon1_raw_nosil.txt")))),
        dict_cmu)
    pipe_to(
        sort(set(x.split()[0] for x in cat(dict_cmu))),
        os.path.join(dst_dict_dir, "wordlist.cmu"),
    )

    cleaned_gz = os.path.join(dst_dict_dir, "cleaned.gz")
    train_data_root = os.path.join(dir_13_32_1, "wsj1", "doc", "lng_modl",
                                   "lm_train", "np_data")
    assert os.path.isdir(train_data_root)
    train_data_files = []
    for subdir in ("87", "88", "89"):
        train_data_files.extend(glob(os.path.join(train_data_root, subdir), r"*.z"))
    isword = set(x.strip() for x in cat(os.path.join(dst_dict_dir, "wordlist.cmu")))
    with gzip.open(cleaned_gz, "wt") as out:
        for train_data_file in train_data_files:
            with open(train_data_file, "rb") as in_:
                compressed = in_.read()
            decompressed = unlzw(compressed)
            in_ = io.TextIOWrapper(io.BytesIO(decompressed))
            for line in in_:
                if line.startswith("<"):
                    continue
                A = line.strip().upper().split(" ")
                for n, a in enumerate(A):
                    if a not in isword and len(a) > 1 and a.endswith("."):
                        out.write(a[:-1])
                        if n < len(A) - 1:
                            out.write("\n")
                    else:
                        out.write(a + " ")
                out.write("\n")
            del in_, compressed, decompressed

    counts = Counter()
    with gzip.open(cleaned_gz, "rt") as cleaned:
        for line in cleaned:
            for token in line.strip().split():
                counts[token] += 1
    counts = sorted(((v, k) for (k, v) in counts.items()), reverse=True)
    digits = set(str(x) for x in range(10))
    oov_counts_path = os.path.join(dst_dict_dir, "oov.counts")
    oovlist_path = os.path.join(dst_dict_dir, "oovlist")
    with open(os.path.join(dst_dict_dir, "unigrams"), "w") as unigrams, \
            open(oov_counts_path, "w") as oov_cnts, \
            open(oovlist_path, "w") as oov_lst:
        for count, word in counts:
            line = "{} {}\n".format(count, word)
            unigrams.write(line)
            if word not in isword:
                oov_cnts.write(line)
                if not (set(word) & digits) and count >= mincount:
                    oov_lst.write(word + "\n")
    del counts

    dict_acronyms_path = os.path.join(dst_dict_dir, "dict.acronyms")
    pipe_to(
        dict_get_acronym_prons(oovlist_path, dict_cmu),
        dict_acronyms_path,
    )

    f_dir = os.path.join(dst_dict_dir, "f")
    b_dir = os.path.join(dst_dict_dir, "b")
    mkdir(f_dir, b_dir)
    banned = set(",;")
    pipe_to(
        (x for x in cat(dict_cmu) if not (set(x.split()[0]) & banned)),
        os.path.join(f_dir, "dict"),
    )
    pipe_to(
        (x for x in cat(os.path.join(dst_dict_dir, "oovlist"))
         if not (set(x.split()[0]) & banned)),
        os.path.join(f_dir, "oovs"),
    )
    pipe_to(
        (" ".join([w[::-1]] + p.split()[::-1])
         for (w, p) in (x.split(" ", 1) for x in cat(os.path.join(f_dir, "dict")))),
        os.path.join(b_dir, "dict"),
    )
    pipe_to((x[::-1] for x in cat(os.path.join(f_dir, "oovs"))),
            os.path.join(b_dir, "oovs"))

    for dir_ in (f_dir, b_dir):
        dict_path = os.path.join(dir_, "dict")
        rules_path = os.path.join(dir_, "rules")
        hierarchy_path = os.path.join(dir_, "hierarchy")
        oovs_path = os.path.join(dir_, "oovs")
        rule_counts_path = os.path.join(dir_, "rule.counts")
        rules_with_scores_path = os.path.join(dir_, "rules.with_scores")
        oov_candidates_path = os.path.join(dir_, "oovs.candidates")
        pipe_to(dict_get_rules(cat(dict_path)), rules_path)
        pipe_to(dict_get_rule_hierarchy(rules_path), hierarchy_path)
        pipe_to(
            dict_count_rules(dict_score_prons(
                dict_path,
                dict_limit_candidate_prons(
                    hierarchy_path,
                    dict_get_candidate_prons(rules_path, dict_path, dict_path),
                ),
            )),
            rule_counts_path,
        )
        pipe_to(
            sorted(
                dict_score_rules(rule_counts_path),
                key=lambda x: (float(x.split(";")[2]), x),
                reverse=True,
            ),
            rules_with_scores_path,
        )
        pipe_to(
            dict_limit_candidate_prons(
                hierarchy_path,
                dict_get_candidate_prons(rules_with_scores_path, dict_path, oovs_path),
            ),
            oov_candidates_path,
        )

    oov_candidates_path = os.path.join(dst_dict_dir, "oovs.candidates")
    pipe_to(
        sorted(cat(
            dict_reverse_candidates(os.path.join(b_dir, "oovs.candidates")),
            os.path.join(f_dir, "oovs.candidates"),
        )),
        oov_candidates_path,
    )
    dict_oovs_path = os.path.join(dst_dict_dir, "dict.oovs")
    pipe_to(
        ("{0} {1}".format(*x.split(";"))
         for x in dict_select_candidate_prons(oov_candidates_path)),
        dict_oovs_path,
    )
    dict_oovs_merged_path = os.path.join(dst_dict_dir, "dict.oovs_merged")
    pipe_to(
        sorted(set(cat(dict_acronyms_path, dict_oovs_path))),
        dict_oovs_merged_path,
    )
    pipe_to(
        sorted(set(cat(
            ["!SIL SIL", "<SPOKEN_NOISE> SPN", "<UNK> SPN", "<NOISE> NSN"],
            dict_cmu,
            dict_oovs_merged_path,
        ))),
        os.path.join(dst_dict_dir, "lexicon.txt"),
    )

def dict_get_acronym_prons(oovlist, dict_):
    # this function first extracts single-letter acronyms from the CMU dict.
    # It then looks for words in the oovlist file that look like acronyms, and
    # builds up some pronunciations based on them
    #
    # consult Kaldi's wsj/s5/local/dict/get_acronym_prons.pl for more details
    def get_letter_prons(letters, letter_prons):
        acronym = list(letters)
        prons = [""]
        while acronym:
            letter = acronym.pop(0)
            n = 1
            while acronym and acronym[0] == letter:
                acronym.pop(0)
                n += 1
            letter_pron = letter_prons[letter]
            prons_of_block = []
            if n == 2:
                for lpron in letter_pron:
                    prons_of_block.append("D AH1 B AH0 L " + lpron)
                    prons_of_block.append(lpron + " " + lpron)
            elif n == 3:
                for lpron in letter_pron:
                    prons_of_block.append("T R IH1 P AH0 L " + lpron)
                    prons_of_block.append(" ".join([lpron] * 3))
            else:
                for lpron in letter_pron:
                    prons_of_block.append(" ".join([lpron] * n))
            new_prons = []
            for pron in prons:
                for pron_of_block in prons_of_block:
                    if pron:
                        new_prons.append(pron + " " + pron_of_block)
                    else:
                        new_prons.append(pron_of_block)
            prons = new_prons
        assert prons[0] != ""
        for pron in prons:
            yield pron

    if isinstance(dict_, str):
        dict_ = cat(dict_)
    letter_pattern = re.compile(r"^[A-Z]\.$")
    letter_prons = dict()
    for line in dict_:
        word, pron = line.strip().split(" ", maxsplit=1)
        if letter_pattern.match(word):
            letter = word[0]
            letter_prons.setdefault(letter, []).append(pron.strip())
    if isinstance(oovlist, str):
        oovlist = cat(oovlist)
    acro_wo_points_pattern = re.compile(r"^[A-Z]{1,5}$")
    acro_w_points_pattern = re.compile(r"^([A-Z]\.){1,4}[A-Z]\.?$")
    for word in oovlist:
        word = word.strip()
        if acro_wo_points_pattern.match(word):
            for pron in get_letter_prons(word, letter_prons):
                yield word + " " + pron
        elif acro_w_points_pattern.match(word):
            for pron in get_letter_prons(word.replace(".", ""), letter_prons):
                yield word + " " + pron

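# Toy illustration of dict_get_acronym_prons (hypothetical inputs; both
# arguments are in-memory lists, so cat() is not involved). Single-letter
# dictionary entries supply the letter pronunciations; the OOV acronym "ABC"
# gets a spelled-out pronunciation, and a doubled letter such as "AA" also
# gets a "D AH1 B AH0 L ..." variant.
def _demo_acronym_prons():
    toy_dict = ["A. EY1", "B. B IY1", "C. S IY1"]
    return list(dict_get_acronym_prons(["ABC", "AA"], toy_dict))
    # expected along the lines of:
    # ['ABC EY1 B IY1 S IY1',
    #  'AA D AH1 B AH0 L EY1',
    #  'AA EY1 EY1']
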
def parallel_exonerate(ex, subfiles, dbFile, grp_descr, minScore=77, n=1, x=22, l=30,
                       trim=True, via="local"):
    futures = [
        fastqToFasta.nonblocking(ex, sf, n=n, x=x, via=via) for sf in subfiles
    ]
    futures2 = []
    res = []
    resExonerate = []
    faSubFiles = []
    all_ambiguous = []
    all_ambiguous_fastq = []
    all_unaligned = []
    all_discarded = []
    gid, grp_name = grp_descr
    my_minscore = _get_minscore(dbFile)
    primersDict = get_primersList(dbFile)
    for sf in futures:
        subResFile = unique_filename_in()
        faSubFiles.append(sf.wait())
        futures2.append(
            exonerate.nonblocking(ex, faSubFiles[-1], dbFile, minScore=my_minscore,
                                  via=via, stdout=subResFile, memory=6))
        resExonerate.append(subResFile)
    for nf, f in enumerate(resExonerate):
        futures2[nf].wait()
        (resSplitExonerate, alignments) = split_exonerate(f, primersDict, minScore,
                                                          l=l, n=n, trim=trim)
        all_unaligned.append(alignments["unaligned"])
        all_ambiguous.append(alignments["ambiguous"])
        all_ambiguous_fastq.append(alignments["ambiguous_fastq"])
        all_discarded.append(alignments["discarded"])
        res.append(resSplitExonerate)

    # add unaligned file only if it is not empty
    n = count_lines(ex, all_unaligned[0])
    if n > 1:
        catfile = cat(all_unaligned)
        gzipfile(ex, catfile)
        ex.add(catfile + ".gz",
               description=set_file_descr(grp_name + "_unaligned.txt.gz",
                                          groupId=gid, step="exonerate", type="txt",
                                          view="admin",
                                          comment="scores between %i and %i" % (my_minscore, minScore)))
    # add ambiguous file only if it is not empty
    n = count_lines(ex, all_ambiguous[0])
    if n > 1:
        catfile = cat(all_ambiguous)
        gzipfile(ex, catfile)
        ex.add(catfile + ".gz",
               description=set_file_descr(grp_name + "_ambiguous.txt.gz",
                                          groupId=gid, step="exonerate", type="txt",
                                          view="admin",
                                          comment="multiple equally good classifications"))
    # add ambiguous fastq file only if it is not empty
    tot_ambiguous = count_lines(ex, all_ambiguous_fastq[0]) / 4
    if tot_ambiguous > 1:
        catfile = cat(all_ambiguous_fastq)
        gzipfile(ex, catfile)
        ex.add(catfile + ".gz",
               description=set_file_descr(grp_name + "_ambiguous.fastq.gz",
                                          groupId=gid, step="exonerate", type="fastq",
                                          comment="multiple equally good classifications"))
    # add discarded file only if it is not empty
    tot_discarded = count_lines(ex, all_discarded[0]) / 4
    if tot_discarded > 1:
        catfile = cat(all_discarded)
        gzipfile(ex, catfile)
        ex.add(catfile + ".gz",
               description=set_file_descr(grp_name + "_discarded.fastq.gz",
                                          groupId=gid, step="exonerate", type="fastq",
                                          view="admin", comment="remaining seq too short"))
    ## add part input fasta file only if it is not empty
    n = count_lines(ex, faSubFiles[0])
    if n > 1:
        gzipfile(ex, faSubFiles[0])
        ex.add(faSubFiles[0] + ".gz",
               description=set_file_descr(grp_name + "_input_part.fa.gz",
                                          groupId=gid, step="init", type="fa",
                                          view="admin", comment="part"))
    ## add part res exonerate only if it is not empty
    n = count_lines(ex, resExonerate[0])
    if n > 1:
        gzipfile(ex, resExonerate[0])
        ex.add(resExonerate[0] + ".gz",
               description=set_file_descr(grp_name + "_exonerate_part.txt.gz",
                                          groupId=gid, step="exonerate", type="txt",
                                          view="admin", comment="part"))
    resFiles = dict((k, '') for d in res for k in d.keys())
    for k in resFiles.keys():
        v = [d[k] for d in res if k in d]
        resFiles[k] = cat(v[1:], out=v[0])
    return (resFiles, tot_ambiguous, tot_discarded)

        if k in fastqFiles:
            indexFiles[k] = bowtie_build.nonblocking(ex, f, via=via)
    unalignedFiles = {}
    futures = []
    bwtarg = ["-a", "-q", "-n", "2", "-l", "20", "--un"]
    for k, f in indexFiles.iteritems():
        unalignedFiles[k] = unique_filename_in()
        touch(ex, unalignedFiles[k])
        futures.append(bowtie.nonblocking(ex, f.wait(), fastqFiles[k],
                                          bwtarg + [unalignedFiles[k]], via='lsf'))
    for f in futures:
        f.wait()
    return unalignedFiles

if __name__ == "__main__":
    primersDict = get_primersList(sys.argv[2])
    (resSplitExonerate, alignments) = split_exonerate(sys.argv[1], primersDict,
                                                      minScore=8, l=30, n=1,
                                                      trim='False')
    res = []
    res.append(resSplitExonerate)
    resFiles = dict((k, '') for d in res for k in d.keys())
    for k in resFiles.keys():
        v = [d[k] for d in res if k in d]
        resFiles[k] = cat(v[1:], out=v[0])
    print resFiles
    print resSplitExonerate
    print alignments
    print "Done"

from common import Input, transpose, cat
from collections import Counter

if __name__ == '__main__':
    data = Input(6).read().split()
    t_data = [col for col in transpose(data)]
    counters = [Counter(m) for m in t_data]
    frequent_items = [m.most_common(1)[0][0] for m in counters]
    print(cat(frequent_items))

def dict_get_candidate_prons(rules, dict_, words, min_prefix_len=3):
    # the purpose of this script is to apply the rules from dict_get_rules to
    # the word list, given the dictionary. It does no pruning based on
    # hierarchy. We're basically removing one suffix and attaching another.
    #
    # see wsj/s5/local/dict/get_candidate_prons.pl for a more formal descript.
    isrule = set()
    suffix2rule = dict()
    rule_and_stress_to_rule_score = dict()
    for rule in cat(rules):
        rule = rule.split(";", 3)
        rule_score = rule.pop() if len(rule) == 3 else -1
        destress = rule.pop() if len(rule) == 2 else None
        assert len(rule) == 1
        rule = rule[0]
        R = rule.split(",")
        if len(R) != 4:
            raise ValueError("Bad rule {}".format(rule))
        suffix = R[0]
        if rule not in isrule:
            isrule.add(rule)
            suffix2rule.setdefault(suffix, []).append(R)
        if destress is None:
            rule_and_stress_to_rule_score[rule + ";yes"] = rule_score
            rule_and_stress_to_rule_score[rule + ";no"] = rule_score
        else:
            rule_and_stress_to_rule_score[rule + ";" + destress] = rule_score
    word2prons = dict()
    for entry in cat(dict_):
        word, pron = entry.split(maxsplit=1)
        word2prons.setdefault(word, []).append(pron)
    prefixcount = Counter(
        chain.from_iterable((w[:p] for p in range(len(w) + 1)) for w in word2prons))
    if isinstance(words, str):
        words = cat(words)
    for word in words:
        word = word.split(maxsplit=1)[0]
        ownword = 1 if word in word2prons else 0
        for prefix_len in range(min_prefix_len, len(word) + 1):
            prefix, suffix = word[:prefix_len], word[prefix_len:]
            if prefixcount.get(prefix, 0) - ownword == 0:
                continue
            rules_array_ref = suffix2rule.get(suffix, None)
            if rules_array_ref is None:
                continue
            for R in rules_array_ref:
                _, base_suffix, psuffix, base_psuffix = R
                base_word = prefix + base_suffix
                base_prons_ref = word2prons.get(base_word, None)
                if base_prons_ref is None:
                    continue
                if base_psuffix:
                    base_psuffix = " " + base_psuffix
                # FIXME(sdrobert): I think this might split up phones. This'll
                # be bad when some phones share prefixes with others.
                for base_pron in base_prons_ref:
                    base_pron_prefix_len = len(base_pron) - len(base_psuffix)
                    if (base_pron_prefix_len < 0
                            or base_pron[base_pron_prefix_len:] != base_psuffix):
                        continue
                    pron_prefix = base_pron[:base_pron_prefix_len]
                    rule = ",".join(R)
                    for destress in range(2):
                        if destress:
                            pron_prefix = pron_prefix.replace("2", "1")
                            destress_mark = "yes"
                        else:
                            destress_mark = "no"
                        pron = pron_prefix
                        if psuffix:
                            pron += " " + psuffix
                        rule_score = rule_and_stress_to_rule_score.get(
                            rule + ";" + destress_mark, None)
                        if rule_score is None:
                            continue
                        output = [word, pron, base_word, base_pron, rule,
                                  destress_mark]
                        if rule_score != -1:
                            output.append(str(rule_score))
                        yield ";".join(output)

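# Toy run of the rule application above (hypothetical data; assumes cat()
# passes in-memory lists of lines through unchanged). The rule "S,ED,S,T"
# says: a word ending in the spelling suffix "S" whose pronunciation ends in
# phone S can be derived from a dictionary word ending in "ED" whose
# pronunciation ends in phone T. The OOV word "WALKS" therefore gets a
# candidate built from the dictionary entry "WALKED".
def _demo_candidate_prons():
    rules = ["S,ED,S,T"]
    dict_lines = ["WALKED W AO1 K T"]
    return list(dict_get_candidate_prons(rules, dict_lines, ["WALKS"]))
    # expected candidates along the lines of:
    # ['WALKS;W AO1 K S;WALKED;W AO1 K T;S,ED,S,T;no',
    #  'WALKS;W AO1 K S;WALKED;W AO1 K T;S,ED,S,T;yes']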