def write_assembly_statistics(assembly, outdir):
    """
    Write assembly statistics (contig count and total bp) to
    'pilon_assembly_statistics.tab' inside outdir.

    Parameters
    ----------
    assembly : str
        Path to assembly fasta file
    outdir : str
        Path to the output directory

    Returns
    -------
    None
    """
    assembly_lengths = []
    # Context manager guarantees the fasta file is closed (the original
    # opened it and never closed it); newline=None enables universal
    # newline translation (resolves the old TODO).
    with open(assembly, mode='rt', newline=None) as reader:
        # groupby alternates between header groups (lines starting with '>')
        # and the sequence-line groups that follow each header.
        fasta_iter = (
            g for k, g in
            itertools_groupby(reader, lambda x: x.startswith('>')))
        for header in fasta_iter:
            _ = next(header)[1:].rstrip('\r\n')
            seq = ''.join(s.rstrip('\r\n') for s in next(fasta_iter))
            assembly_lengths.append(len(seq))
    with open(os.path.join(outdir, 'pilon_assembly_statistics.tab'),
              'wt') as writer:
        writer.write('#' + '\t'.join(['contigs', 'bp']) + '\n')
        writer.write(
            '\t'.join(map(str, [len(assembly_lengths),
                                sum(assembly_lengths)])) + '\n')
def kv_items(self):
    """Yield ``(key, value)`` pairs derived from ``self.sequence``.

    When ``self.key`` is set, consecutive items are grouped with
    ``itertools_groupby``; each group's items are mapped through
    ``self.val`` and the mapping is passed to ``self.val_postproc``.
    Otherwise every item is yielded as ``(index, self.val(item))``.
    """
    if self.key is None:
        for index, item in enumerate(self.sequence):
            yield index, self.val(item)
        return
    for group_key, group in itertools_groupby(self.sequence, key=self.key):
        yield group_key, self.val_postproc(map(self.val, group))
def write_assembly_statistics(assembly, outdir):
    """
    Write assembly statistics (contig count and total bp) to
    'pilon_assembly_statistics.tab' inside outdir.

    Parameters
    ----------
    assembly : str
        Path to assembly fasta file
    outdir : str
        Path to the output directory

    Returns
    -------
    None
    """
    assembly_lengths = []
    # Use a context manager so the fasta file is always closed (the
    # original never closed it); newline=None resolves the old TODO.
    with open(assembly, mode='rt', newline=None) as reader:
        # groupby alternates between header groups (lines starting with '>')
        # and the sequence-line groups that follow each header.
        fasta_iter = (
            g for k, g in
            itertools_groupby(reader, lambda x: x.startswith('>')))
        for header in fasta_iter:
            # next() replaces the Python 2-only ``.next()`` method, which
            # raises AttributeError on Python 3.
            _ = next(header)[1:].rstrip('\r\n')
            seq = ''.join(s.rstrip('\r\n') for s in next(fasta_iter))
            assembly_lengths.append(len(seq))
    with open(os.path.join(outdir, 'pilon_assembly_statistics.tab'),
              'wt') as writer:
        writer.write('#' + '\t'.join(['contigs', 'bp']) + '\n')
        writer.write(
            '\t'.join(map(str, [len(assembly_lengths),
                                sum(assembly_lengths)])) + '\n')
def get_sequence_information(fasta_file):
    """
    Parse a fasta file, cleaning each sequence header via clean_header.

    Parameters
    ----------
    fasta_file : str
        Path to the fasta file.

    Returns
    -------
    sequence_dict : dict
        Maps a 1-based sequence counter to a dict with 'header' (cleaned),
        'sequence' and 'length' keys.
    headers : dict
        Maps each cleaned header to its original header.
    headers_changed : bool
        True if at least one header was modified by clean_header.
    """
    headers = {}
    sequence_dict = {}
    headers_changed = False
    sequence_counter = 0
    # Context manager guarantees the file is closed even when sys.exit()
    # aborts the parse on a duplicated header (the original leaked the
    # handle in that case).
    with open(fasta_file, mode='rt', newline=None) as reader:
        fasta_iter = (
            g for k, g in
            itertools_groupby(reader, lambda x: x.startswith('>')))
        for header in fasta_iter:
            original_header, new_header = clean_header(
                next(header)[1:].rstrip('\r\n'))
            if new_header in headers:
                sys.exit('Found duplicated sequence'
                         ' headers: {original_header}'.format(
                             original_header=original_header))
            seq = ''.join(s.rstrip('\r\n') for s in next(fasta_iter))
            sequence_counter += 1
            sequence_dict[sequence_counter] = {
                'header': new_header,
                'sequence': seq,
                'length': len(seq)
            }
            headers[new_header] = str(original_header)
            if new_header != original_header:
                headers_changed = True
    return sequence_dict, headers, headers_changed
def clean_novel_alleles(novel_alleles, scheme_mlst, profile):
    """
    Clean the fasta file with the novel alleles produced by mlst

    Keeps only alleles for genes flagged as unknown ('~' prefix) in the
    profile: the original file is removed and, if anything remains,
    rewritten with just those entries.

    Parameters
    ----------
    novel_alleles : str
        Path for fasta file containing the novel alleles
    scheme_mlst : str
        MLST schema found by mlst
    profile : list
        List of strings with the profile found

    Returns
    -------
    None
    """
    unknown_genes = []
    for gene_allele in profile:
        gene = gene_allele.split('(')[0]
        try:
            allele = gene_allele.split('(')[1].rstrip(')')
            if allele.startswith('~'):
                unknown_genes.append(gene)
        except IndexError as e:
            # Entry without '(allele)' part; keep going best-effort.
            print('WARNING: {}'.format(e))
    try:
        novel_alleles_keep = {}
        if len(unknown_genes) > 0:
            # Context manager replaces the manual reader.close() so the
            # file is closed even if parsing raises.
            with open(novel_alleles, mode='rt', newline=None) as reader:
                fasta_iter = (
                    g for k, g in
                    itertools_groupby(reader,
                                      lambda x: x.startswith('>')))
                for header in fasta_iter:
                    # next() replaces the Python 2-only ``.next()``
                    # method, which raises AttributeError on Python 3.
                    header = next(header)[1:].rstrip('\r\n')
                    seq = ''.join(
                        s.rstrip('\r\n') for s in next(fasta_iter))
                    if header.startswith(scheme_mlst):
                        gene = header.split('.')[1].split('~')[0]
                        if gene in unknown_genes:
                            novel_alleles_keep[header] = seq
            os.remove(novel_alleles)
            if len(novel_alleles_keep) > 0:
                with open(novel_alleles, 'wt') as writer:
                    for header, seq in novel_alleles_keep.items():
                        writer.write('>{}\n'.format(header))
                        writer.write(
                            '\n'.join(utils.chunkstring(seq, 80)) + '\n')
    except OSError as e:
        # mlst reported an unknown ST but wrote no novel-alleles file.
        print('An unknown ST was found but no novel alleles fasta file was produced by mlst software:\n'
              '{}'.format(e))
def clean_novel_alleles(novel_alleles, scheme_mlst, profile):
    """
    Clean the fasta file with the novel alleles produced by mlst

    Keeps only alleles for genes flagged as unknown ('~' prefix) in the
    profile: the original file is removed and, if anything remains,
    rewritten with just those entries.

    Parameters
    ----------
    novel_alleles : str
        Path for fasta file containing the novel alleles
    scheme_mlst : str
        MLST schema found by mlst
    profile : list
        List of strings with the profile found

    Returns
    -------
    None
    """
    unknown_genes = []
    for gene_allele in profile:
        gene = gene_allele.split('(')[0]
        try:
            allele = gene_allele.split('(')[1].rstrip(')')
            if allele.startswith('~'):
                unknown_genes.append(gene)
        except IndexError as e:
            # Entry without '(allele)' part; keep going best-effort.
            print('WARNING: {}'.format(e))
    novel_alleles_keep = {}
    if len(unknown_genes) > 0:
        try:
            # Context manager replaces the manual reader.close() so the
            # file is closed even if parsing raises; the OSError guard
            # matches the sibling implementation of this function, which
            # tolerates mlst reporting an unknown ST without writing a
            # novel-alleles file.
            with open(novel_alleles, mode='rt', newline=None) as reader:
                fasta_iter = (
                    g for k, g in
                    itertools_groupby(reader,
                                      lambda x: x.startswith('>')))
                for header in fasta_iter:
                    header = next(header)[1:].rstrip('\r\n')
                    seq = ''.join(
                        s.rstrip('\r\n') for s in next(fasta_iter))
                    if header.startswith(scheme_mlst):
                        gene = header.split('.')[1].split('~')[0]
                        if gene in unknown_genes:
                            novel_alleles_keep[header] = seq
            os.remove(novel_alleles)
            if len(novel_alleles_keep) > 0:
                with open(novel_alleles, 'wt') as writer:
                    for header, seq in novel_alleles_keep.items():
                        writer.write('>{}\n'.format(header))
                        writer.write(
                            '\n'.join(utils.chunkstring(seq, 80)) + '\n')
        except OSError as e:
            print('An unknown ST was found but no novel alleles fasta'
                  ' file was produced by mlst software:\n'
                  '{}'.format(e))
def groupby(inset, keyfunc):
    """groupby on unsorted inset

    Sorts ``inset`` by ``keyfunc`` first so that itertools' groupby sees
    equal keys as one contiguous run, then delegates to it.
    """
    ordered = sorted(inset, key=keyfunc)
    return itertools_groupby(ordered, keyfunc)
css=osp.relpath(collection_css, path) ) with open(path, 'wt', encoding='utf-8') as fp: fp.write(render(collection_template, view)) # write out an all collections file with open(osp.join(collection_path, 'ALL'), 'wt', encoding='utf-8') as fp: fp.write(' '.join(all_collections)) # write the index.htmls idxtemplate = open("src/book-index.mako").read() idxpaths = sorted(set(b["path"] for b in ndx)) start = osp.join(CONTENT, "index.html") back = start i = 1 for path, group in itertools_groupby(ndx, lambda v: v["path"]): view = dict( name="index", books=group, back=osp.relpath(back, osp.dirname(path)), next=osp.relpath(idxpaths[i] if i < len(idxpaths) else start, osp.dirname(path)), css=osp.relpath(osp.join(OUT, "index.css"), path), ) with open(path, "wt", encoding="utf-8") as fp: fp.write(render(idxtemplate, view)) back = path i += 1 # write the word indexes WOUT = osp.join(CONTENT, "index")