def parse_readsRC(reads_file):
    """
    parse reads file and return both the sequence and its reverse complement

    The format is decided from the first line only: a first line starting
    with '>' selects parse_fasta, anything else selects parse_fastq. The
    handle is rewound with seek(0) before parsing, so it must be seekable.

    Yields, for every record, the sequence followed by its reverse
    complement. FASTA sequences are upper-cased first; FASTQ sequences are
    used exactly as parsed. An empty file yields nothing.

    Raises KeyError when a base is not one of A, T, G, C, N or '-'.
    """
    complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N', '-': '-'}
    first_line = next(iter(reads_file), None)
    if first_line is None:
        # empty input: there is no first line to sniff the format from
        # (this case previously failed on an unbound `file_type`)
        return
    reads_file.seek(0)
    if first_line.startswith('>'):
        sequences = (record[1].upper() for record in parse_fasta(reads_file))
    else:
        sequences = (record[1] for record in parse_fastq(reads_file))
    for seq in sequences:
        # return sequence and reverse complement
        yield seq
        yield ''.join([complement[base] for base in seq[::-1]])
def parse_genomes(fastas, single):
    """
    generator for parsing fastas

    fastas: iterable of objects accepted by parse_fasta; when single is
    True each one must also expose a `.name` attribute.
    single: when exactly True, all records of a file are merged into one
    entry named after the file name with its last '.suffix' removed;
    otherwise one entry is produced per record, named by the first
    whitespace-delimited token after '>'.

    Yields (name, length, bases) where bases is a list of upper-cased
    characters.
    """
    merge_records = single is True
    for handle in fastas:
        if merge_records:
            bases = []
            for record in parse_fasta(handle):
                # in-place += on a list extends it character by character
                bases += record[1].upper()
            yield (handle.name.rsplit('.', 1)[0], len(bases), bases)
            continue
        for record in parse_fasta(handle):
            name = record[0].split('>', 1)[1].split()[0]
            yield (name, len(record[1]), list(record[1].upper()))
def scaffold_hits(searches, fasta, max_hits):
    """
    get hits from each search against each RP

    searches: re-iterable collection of (rp, blast_path) pairs; it is
    traversed once to build the result skeleton and once to load hits.
    fasta: ORF fasta passed to parse_fasta; the scaffold name is the first
    header token with its final '_<suffix>' removed.
    max_hits: forwarded to numblast.

    Returns (scaffolds, s2rp):
        scaffolds[scaffold] = # ORFs
        s2rp[scaffold] = {rp: [hits]}

    Raises KeyError when a hit names a scaffold that is absent from fasta.

    NOTE(review): evalue_thresh and bit_thresh are not parameters; they are
    resolved from an enclosing (presumably module-level) scope - confirm
    they are defined before this is called.
    """
    # scaffolds[scaffold] = # ORFs
    scaffolds = {}
    for seq in parse_fasta(fasta):
        scaffold = seq[0].split()[0].split('>', 1)[1].rsplit('_', 1)[0]
        scaffolds[scaffold] = scaffolds.get(scaffold, 0) + 1
    s2rp = {s: {r[0]: [] for r in searches} for s in scaffolds}
    # get hits from blast
    for rp, blast in searches:
        # read every hit while the file is open so the handle is always
        # closed, even if numblast raises
        with open(blast) as blast_handle:
            hits = list(
                numblast(blast_handle, max_hits, evalue_thresh, bit_thresh))
        for hit in hits:
            s = hit[0].split()[0].rsplit('_', 1)[0]
            # columns 10 and 11 are converted to float in place
            hit[10], hit[11] = float(hit[10]), float(hit[11])
            s2rp[s][rp].append(hit)
    return scaffolds, s2rp
def strip_inserts(fasta):
    """
    remove insertion columns from aligned fasta file

    Every character that is neither '-' nor upper case is dropped from the
    sequence. The record yielded by parse_fasta is modified in place and
    then yielded.
    """
    for record in parse_fasta(fasta):
        kept = [char for char in record[1] if char.isupper() or char == '-']
        record[1] = ''.join(kept)
        yield record
def check_assembly(assembly, pr, threads, cov_thresh,
                   mismatches, collection_mismatches, multiple,
                   prefix, window, combine_windows,
                   pr_split, allow_orphan, allow_orphan_ends, save_mapping):
    """
    identify assembly errors

    Pipeline: load the assembly, map reads, find error positions from the
    stringent mapping, build windows around them and collect the reads
    that map to those windows.

    Returns (scaffolds, mapping, s2c, s2errors, reads, pr_split) where
    scaffolds[id] = [header, length, sequence] and pr_split is the value
    handed back by map_reads.
    """
    # read assembly into memory, skipping the empty [[], []] record
    scaffolds = {}
    for record in parse_fasta(assembly):
        if record == [[], []]:
            continue
        scaffolds[record[0].split('>')[1]] = \
            [record[0], len(record[1]), record[1]]

    # map reads to assembly
    mapping, pr_split = map_reads(
        assembly, scaffolds, pr, threads, multiple, pr_split=pr_split)

    # identify errors as zero coverage regions based on stringent mapping
    stringent_pairs, header = parse_mapping_stringent(
        mapping, scaffolds, mismatches)

    # * s2c[scaffold] = [per scaffold coverage at each position]
    # * s2errors = [positions with errors]
    s2c, s2errors = id_errors(
        stringent_pairs, header, assembly, scaffolds, cov_thresh, mismatches,
        allow_orphan=allow_orphan,
        allow_orphan_ends=allow_orphan_ends,
        save_mapping=save_mapping)

    # define windows
    s2windows = define_windows(scaffolds, s2errors, window, combine_windows)
    error_pairs = parse_mapping_errors(mapping, s2errors, s2windows)

    # collect reads that map to window
    # * reads[scaffold][error] = pe
    reads = collect_reads(
        error_pairs, assembly, scaffolds, collection_mismatches, prefix,
        s2errors, s2windows)
    return scaffolds, mapping, s2c, s2errors, reads, pr_split
def parse_catalytic(insertion, gff):
    """
    parse catalytic RNAs to gff format

    insertion: mapping with 'offset', 'strand', 'intron' and 'ID' entries;
    'intron' holds '|'-joined fasta lines whose headers look like
    '>ID annotation strand start-end'.
    gff: mapping of column name -> list; one value is appended to every
    column per intron.

    Returns gff (the same object). If 'intron' is not exactly a str the
    table is returned untouched.
    """
    offset = insertion['offset']
    gene_strand = insertion['strand']
    if type(insertion['intron']) is not str:
        return gff
    for intron in parse_fasta(insertion['intron'].split('|')):
        name, annot, strand, pos = intron[0].split('>')[1].split()
        start, end = map(int, pos.split('-'))
        if strand != gene_strand:
            # opposite strand to the gene: flip the strand, swap the ends
            # and shift both by two
            strand = '-' if strand == '+' else '+'
            start, end = end - 2, start - 2
        start = abs(start + offset) - 1
        end = abs(end + offset) - 1
        row = (
            ('#seqname', insertion['ID']),
            ('source', 'Rfam'),
            ('feature', 'Catalytic RNA'),
            ('start', start),
            ('end', end),
            ('score', '.'),
            ('strand', strand),
            ('frame', '.'),
            ('attribute', 'ID=%s; Name=%s' % (name, annot)),
        )
        for column, value in row:
            gff[column].append(value)
    return gff
def parse_insertion(insertion, gff):
    """
    parse insertion to gff format

    insertion: mapping with 'offset', 'strand', 'ID', 'source' and
    'insertion sequence' entries; the last one holds '|'-joined fasta
    lines whose headers carry a 'gene-pos=start-end' field.
    gff: mapping of column name -> list; one value is appended to every
    column per insertion.

    Returns gff (the same object).
    """
    offset = insertion['offset']
    for ins in parse_fasta(insertion['insertion sequence'].split('|')):
        strand = insertion['strand']  # same as rRNA
        name = ins[0].split('>')[1].split()[0]
        position = ins[0].split('gene-pos=', 1)[1].split()[0]
        start, end = map(int, position.split('-'))
        start, end = abs(start + offset), abs(end + offset)
        if strand == '-':
            start, end = end, start
        row = (
            ('#seqname', insertion['ID']),
            ('source', insertion['source']),
            ('feature', 'IVS'),
            ('start', start),
            ('end', end),
            ('score', '.'),
            ('strand', strand),
            ('frame', '.'),
            ('attribute', 'ID=%s' % (name)),
        )
        for column, value in row:
            gff[column].append(value)
    return gff
def parse_genomes(fastas, single):
    """
    generator for parsing fastas

    Yields (name, length, bases) tuples, bases being a list of upper-cased
    characters.

    single is exactly True: one tuple per file, with all of its records
    concatenated; the name is the file's `.name` up to the last '.'.
    Any other value: one tuple per record, named by the first
    whitespace-delimited token after '>'.
    """
    if single is not True:
        for fasta in fastas:
            for entry in parse_fasta(fasta):
                seq_id = entry[0].split('>', 1)[1].split()[0]
                residues = list(entry[1].upper())
                yield (seq_id, len(entry[1]), residues)
        return
    for fasta in fastas:
        combined = []
        for entry in parse_fasta(fasta):
            combined.extend(entry[1].upper())
        yield (fasta.name.rsplit('.', 1)[0], len(combined), combined)
def genome_coverage(genomes, scaffold_coverage, total_bases):
    """
    calculate coverage statistics for each genome

    Accumulates per-scaffold values through sum_coverage and calc_custom,
    prints '<genome>\\t<value>' for every genome (the second item of its
    first coverage entry), and finally replaces each genome's coverage
    entries by entry[0] / entry[1].

    total_bases: when exactly True it is replaced by the result of
    calc_total_mapped_bases(coverage) before the absolute abundance is
    calculated; it is passed to calc_custom as given.

    Returns (coverage, std, absolute, relative, custom_av, custom_std).
    """
    coverage, custom, std = {}, {}, {}
    for genome in genomes:
        for sequence in parse_fasta(genome):
            scaffold = sequence[0].split('>')[1].split()[0]
            coverage, std = sum_coverage(
                coverage, std, genome, scaffold, sequence, scaffold_coverage)
            custom = calc_custom(
                custom, genome, scaffold, sequence, scaffold_coverage,
                total_bases)
    std = calc_std(std)
    custom_std = calc_std(custom)
    # mean of every sample, per genome
    custom_av = {
        genome: [numpy.mean(sample) for sample in custom[genome]]
        for genome in custom
    }
    for genome in coverage:
        print('%s\t%s' % (genome, coverage[genome][0][1]))
    if total_bases is True:
        total_bases = calc_total_mapped_bases(coverage)
    # absolute abundance uses the un-normalised entries, so it is
    # computed before they are overwritten below
    absolute = absolute_abundance(coverage, total_bases)
    for genome in coverage:
        coverage[genome] = [calc[0] / calc[1] for calc in coverage[genome]]
    relative = relative_abundance(coverage)
    return coverage, std, absolute, relative, custom_av, custom_std
def analyze_fa(fa):
    """
    analyze fa (names, insertions) and convert fasta to prodigal/cmscan
    safe file
    - find insertions (masked sequence)
    - make upper case
    - assign names to id number

    fa: object accepted by parse_fasta that exposes `.name`.

    Writes two files:
      * '<fa.name>.id' ('temp.id' when fa.name is '<stdin>') with every
        sequence upper-cased under a zero-padded ten-digit ID header
      * '<fa.name>.id.lookup' with tab-separated id/name pairs, in both
        directions
    NOTE(review): the lookup file name is built from fa.name even for
    '<stdin>', unlike the safe file - confirm that this is intended.

    Returns (safe file name, sequences, id2name, names, insertions) where
    sequences[id] is the whole parsed record and
    insertions[id] = insertions_from_masked(sequence).
    """
    if fa.name == '<stdin>':
        safe_name = 'temp.id'
    else:
        safe_name = '%s.id' % (fa.name)
    sequences = {}  # sequences[id] = record as yielded by parse_fasta
    insertions = {}  # insertions[id] = result of insertions_from_masked
    id2name = {}  # maps id -> name and name -> id
    names = []
    # context managers guarantee both files are closed on any error
    with open(safe_name, 'w') as safe:
        for count, seq in enumerate(parse_fasta(fa)):
            seq_id = '%010d' % (count, )
            name = seq[0].split('>', 1)[1]
            id2name[seq_id] = name
            id2name[name] = seq_id
            names.append(name)
            insertions[seq_id] = insertions_from_masked(seq[1])
            sequences[seq_id] = seq
            print('\n'.join(['>%s' % (seq_id), seq[1].upper()]), file=safe)
    with open('%s.id.lookup' % (fa.name), 'w') as lookup:
        for pair in id2name.items():
            print('\t'.join(pair), file=lookup)
    return safe_name, sequences, id2name, names, insertions
def concat_align(fastas):
    """
    concatenate alignments

    fastas: re-iterable collection of hashable fasta references accepted
    by parse_fasta (it is traversed twice and its items are used as
    dictionary keys).

    Returns concat[ID] = [one piece per fasta, in fastas order]. A piece
    is the ID's sequence in that fasta or, when the ID is missing from
    it, a run of '-' as long as the last sequence read from that fasta.
    IDs appear in first-seen order. An alignment file without any record
    contributes empty pieces.
    """
    # read in sequences
    fa2len = {}
    seqs = {}
    IDs = {}  # used as an insertion-ordered set
    for fasta in fastas:
        seqs[fasta] = {}
        # default for a file with no records; previously the length came
        # from a loop variable that was unbound or stale in that case
        fa2len[fasta] = 0
        for seq in parse_fasta(fasta):
            ID = seq[0].split('>')[1].split()[0]
            IDs[ID] = None
            seqs[fasta][ID] = seq[1]
            fa2len[fasta] = len(seq[1])
    # concat sequences
    concat = {}
    for fasta in fastas:
        for ID in IDs:
            pieces = concat.setdefault(ID, [])
            if ID in seqs[fasta]:
                pieces.append(seqs[fasta][ID])
            else:
                pieces.append('-' * fa2len[fasta])
    return concat
def parse_orf(insertion, gff):
    """
    parse ORF to gff format

    insertion: mapping with 'offset', 'orf', 'strand' and 'ID' entries;
    'orf' holds '|'-joined fasta lines whose headers carry
    ' # start # end # strand' fields, with the annotation as the second
    whitespace-delimited token.
    gff: mapping of column name -> list; one value is appended to every
    column per ORF.

    Returns gff (the same object). If 'orf' is not exactly a str the table
    is returned untouched.
    """
    offset = insertion['offset']
    if type(insertion['orf']) is not str:
        return gff
    for orf in parse_fasta(insertion['orf'].split('|')):
        name = orf[0].split('>')[1].split()[0]
        start, end, strand_code = map(int, orf[0].split(' # ')[1:4])
        strand = '+' if strand_code == 1 else '-'
        gene_strand = insertion['strand']
        if strand != gene_strand:
            # opposite strand to the gene: flip the strand, swap the ends
            # and shift both by two
            strand = '-' if strand == '+' else '+'
            start, end = end - 2, start - 2
        start = abs(start + offset) - 1
        end = abs(end + offset) - 1
        annot = orf[0].split()[1]
        if annot == 'n/a':
            annot = 'unknown'
        row = (
            ('#seqname', insertion['ID']),
            ('source', 'Prodigal and Pfam'),
            ('feature', 'CDS'),
            ('start', start),
            ('end', end),
            ('score', '.'),
            ('strand', strand),
            ('frame', '.'),
            ('attribute', 'ID=%s; Name=%s' % (name, annot)),
        )
        for column, value in row:
            gff[column].append(value)
    return gff
def fix_fasta(fasta):
    """
    remove pesky characters from fasta file header

    Each header is passed through remove_char (the record is updated in
    place); records with an empty sequence are not yielded.
    """
    for record in parse_fasta(fasta):
        record[0] = remove_char(record[0])
        if len(record[1]) <= 0:
            continue
        yield record
def find_16S(fastas, hmms, bit_thresh=float(20), length_thresh=500, masking=True, buffer=0):
    """
    extract 16S sequences from fastas using HMM search coordinates

    find_coordinates(hmms, bit_thresh) supplies, per sequence ID and
    group, [model, strand, coordinates, matches, gaps]. For every group on
    a sequence found in the fastas:
    1) insertion bases are summed from the gaps
    2) the hit is skipped when its non-insertion length is below
       length_thresh (the buffer is not included in that length)
    3) retrieval coordinates are widened by buffer and clamped to the
       sequence
    4) when masking is exactly True the sequence is passed through
       mask_sequence with the gaps (the masked sequence is kept for later
       groups on the same contig)
    5) the region is cut out and reverse complemented if strand is '-'

    Yields [header, sequence]; the header is the original header followed
    by a '16SfromHMM::' annotation describing the hit.
    """
    # identify start/stop positions
    # group2hmm[seq][group] = [model, strand, coordinates, matches, gaps]
    group2hmm = find_coordinates(hmms, bit_thresh)
    header_format = '%s 16SfromHMM::model=%s seq=%s pos=%s-%s strand=%s total-len=%s 16S-len=%s model-pos(ins-len)=%s buffer-len=%s/%s ins-bases=%s'
    # get sequences from fasta file
    for fasta in fastas:
        for record in parse_fasta(fasta):
            seq_id = record[0].split('>')[1].split()[0]
            if seq_id not in group2hmm:
                continue
            record[1] = record[1].upper()
            hit_number = 0  # how many 16S genes are there on the contig?
            for hit in list(group2hmm[seq_id].values()):
                model, strand, coords, matches, gaps = hit
                # count insertion bases from gaps
                insertion_bases = sum(gap[1] - gap[0] + 1 for gap in gaps)
                # length of non-insertion regions (don't include buffer)
                total_length = coords[1] - coords[0] + 1
                aligned_length = total_length - insertion_bases
                if aligned_length < length_thresh:
                    continue
                # count sequence
                hit_number += 1
                # set retrieval coords based on buffer
                first = max(coords[0] - buffer, 1)
                last = min(coords[1] + buffer, len(record[1]))
                ret_coords = [first, last, coords[2]]
                buffer_ends = check_buffer(coords, len(record[1]), buffer)
                # mask insertion sequences
                if masking is True:
                    record[1] = mask_sequence(record[1], gaps)
                region = record[1][(ret_coords[0] - 1):(ret_coords[1])]
                # one insert length per gap, then 'end' for the last match
                inserts = [gap[1] - gap[0] + 1 for gap in gaps] + ['end']
                model_pos = ';'.join(
                    '%s-%s(%s)' % (match[2], match[3], insert)
                    for match, insert in zip(matches, inserts))
                header = header_format % (
                    record[0], model, hit_number,
                    ret_coords[0], ret_coords[1], strand,
                    total_length, aligned_length, model_pos,
                    buffer_ends[0], buffer_ends[1], insertion_bases)
                # reverse complement if strand is reverse
                if strand == '-':
                    region = rc(['', region])[1]
                yield [header, region]
def fa2s2b(fastas):
    """
    convert fastas to s2b dictionary

    fastas: iterable of fasta paths (strings).

    Returns s2b[sequence ID] = file name without directory and without its
    last '.suffix'. An ID found in several files keeps the last one.
    """
    s2b = {}
    for path in fastas:
        s2b.update({
            record[0].split('>', 1)[1].split()[0]:
                path.rsplit('/', 1)[-1].rsplit('.', 1)[0]
            for record in parse_fasta(path)
        })
    return s2b
def idba_ud_seqs(fasta_name, idba_dir):
    """
    get fasta file from idba_ud assembly directory, save to dictionary

    Uses '<idba_dir>/scaffold.fa' when it exists, otherwise
    '<idba_dir>/contig.fa'.

    fasta_name: path that receives a copy of the assembly with every
    space replaced by '_'.

    Returns re_assembled_seqs[id] = [header, length, sequence] with spaces
    in id and header replaced by '_'; records whose header is not exactly
    a str are skipped.

    Raises OSError (e.g. FileNotFoundError) if the assembly cannot be read
    or fasta_name cannot be written.
    """
    out = '%s/scaffold.fa' % (idba_dir)
    if os.path.exists(out) is False:
        out = '%s/contig.fa' % (idba_dir)
    re_assembled_seqs = {
        seq[0].split('>')[1].replace(' ', '_'):
            [seq[0].replace(' ', '_'), len(seq[1]), seq[1]]
        for seq in parse_fasta(out) if type(seq[0]) is str
    }
    # byte-for-byte equivalent of `cat out | tr " " "_" > fasta_name`,
    # done in Python so paths containing spaces or shell metacharacters
    # cannot break or alter the command
    with open(out, 'rb') as source, open(fasta_name, 'wb') as target:
        for line in source:
            target.write(line.replace(b' ', b'_'))
    return re_assembled_seqs
def strip_masked(fasta, min_len, print_masked):
    """
    remove masked regions from fasta file as long as they are longer than
    min_len

    parse_masked(record, min_len) supplies the retained pieces and the
    masked pieces of each record.

    Yields [0, [header, sequence]] for the sequence without masked
    regions and, when print_masked is exactly True, [1, [header, region]]
    for every non-empty masked region, numbered from 1.
    """
    for record in parse_fasta(fasta):
        unmasked, masked = parse_masked(record, min_len)
        stripped_header = '%s removed_masked >=%s' % (record[0], min_len)
        yield [0, [stripped_header, ''.join(unmasked)]]
        if print_masked is not True:
            continue
        regions = [region for region in masked if region != []]
        for number, region in enumerate(regions, 1):
            region_header = '%s insertion:%s' % (record[0], number)
            yield [1, [region_header, ''.join(region)]]
def shuffle_genome(genome, cat, fraction = float(100), plot = True, \
        alpha = 0.1, beta = 100000, \
        min_length = 1000, max_length = 200000):
    """
    randomly shuffle genome

    All records of genome are joined, cut into consecutive fragments whose
    lengths are drawn from random.gammavariate(alpha, beta) (draws that
    are <= min_length or >= max_length are rejected and redrawn), and the
    fragments are shuffled with random.shuffle.

    fraction: percentage of the genome length to keep; unless it equals
    100, fragments are taken in shuffled order until that many bases are
    reached, the last one being truncated to fit.
    cat: when exactly True a single [header, sequence] is yielded,
    otherwise one ['<header> fragment:<i>', fragment] per fragment.
    plot: not used by this function.
    """
    header = '>randomized_%s' % (genome.name)
    remaining = list(''.join([record[1] for record in parse_fasta(genome)]))
    genome_length = len(remaining)
    pieces = []
    # break genome into pieces
    while True:
        size = int(random.gammavariate(alpha, beta))
        if size <= min_length or size >= max_length:
            continue
        # slicing clamps at the end, so the last piece may be shorter
        pieces.append(''.join(remaining[:size]))
        remaining = remaining[size:]
        if remaining == []:
            break
    # shuffle pieces
    random.shuffle(pieces)
    # subset fragments
    if fraction == float(100):
        subset = pieces
    else:
        max_bases = int(genome_length * fraction / 100)
        subset, kept_bases = [], 0
        for fragment in pieces:
            fragment_length = len(fragment)
            if kept_bases + fragment_length > max_bases:
                # truncate the fragment that would overshoot, then stop
                subset.append(fragment[0:max_bases - kept_bases])
                break
            subset.append(fragment)
            kept_bases += fragment_length
    # combine sequences, if requested
    if cat is True:
        yield [header, ''.join(subset)]
    else:
        for index, fragment in enumerate(subset):
            yield ['%s fragment:%s' % (header, index), fragment]
def check_type(fasta):
    """
    guess whether a fasta file holds nucleotide or protein sequences

    Only the first record is inspected: 'prot' is returned as soon as an
    upper-cased residue is neither A/T/G/C nor one of the ignored
    characters (N, U, '.', '-', space); otherwise, and for a file with no
    records, 'nucl' is returned.
    """
    nucleotides = ['A', 'T', 'G', 'C']
    ignored = ['N', 'U', '.', '-', ' ']
    for record in parse_fasta(fasta):
        for residue in record[1].upper():
            if residue in ignored:
                continue
            if residue not in nucleotides:
                return 'prot'
        # the first record decides; later records are never read
        return 'nucl'
    return 'nucl'
def scaffolder(contigs, pr, pr_split, prefix, threads, mismatches, collection_mismatches=2, overlap=0.90):
    """
    scaffold contigs based on overlap, paired read support, and paired
    read confirmation of scaffold
    1) map reads to contigs (best mapping for each read)
    2) re-assemble reads that map to ends of contigs in order to find
       extensions
    3) blast contigs and extensions against one another and find where
       ends overlap
    4) create network of contig connections (require both paired read
       connections and overlap)
    5) check all possible contig joins with paired read mapping
    6) break edges in network if join is not supported by paired read
       mapping
    7) traverse network to build scaffolds

    prefix: working directory; it is created (with parents) if missing.

    NOTE(review): the result of scaffold_contigs is assigned but not
    returned, so this function returns None - confirm callers rely only
    on what the helpers do with prefix.
    """
    # prepare scaffolder directory; os.makedirs avoids building a shell
    # command from prefix (spaces or metacharacters broke `mkdir -p %s`)
    os.makedirs(str(prefix), exist_ok=True)
    contigs = curate.format_assembly(contigs, prefix)
    # save contigs to dictionary:
    # contigs = [file_path, contigs[id] = [header, length, sequence]]
    contigs = [
        contigs,
        {
            i[0].split('>')[1]: [i[0], len(i[1]), i[1]]
            for i in parse_fasta(contigs) if i != [[], []]
        }
    ]
    # map reads to contigs
    mapping = curate.map_reads(
        contigs[0], contigs[1], pr, threads,
        multiple=False, pr_split=pr_split)
    # find possible contig extensions
    extensions = assemble_extensions(
        contigs, mapping, mismatches, collection_mismatches, threads, prefix)
    combined = combine_fastas(
        [contigs[1], extensions], '%s/combined.fa' % (prefix))
    # find overlap between contigs and extensions
    blast_out = search(
        combined, combined, method='blast',
        max_hits=100, threads=threads, prefix='%s/' % (prefix))
    # create network of contig connections
    graph, joins, id2scaffolds = network_contigs(
        contigs[1], extensions, blast_out, overlap, prefix)
    # check joined contigs with paired read mapping
    graph = filter_graph_mapping(
        joins, graph, pr, threads, prefix, pr_split, mismatches)
    # find best path through graph
    graph = filter_graph_best(graph, prefix)
    # scaffold contigs based on graph
    scaffolds = scaffold_contigs(contigs[1], id2scaffolds, graph, prefix)
def find_genes(fastas):
    """
    register every gene of every fasta in `genes` and `g2index`

    `genes` and `g2index` are not defined here; they are looked up in an
    enclosing scope and filled in place:
        genes[id] = [[id, fasta, description, previous], {}]
        g2index[id] = running index across all fastas
    `previous` is the ID of the preceding gene in the same fasta (0 for
    the first one) and description is 'n/a' when the header has no text
    after the ID. Nothing is returned.
    """
    index = 0
    for fasta in fastas:
        previous = 0
        for sequence in parse_fasta(fasta):
            # split the header at the first space only
            gene_id, space, rest = sequence[0].split('>')[1].partition(' ')
            description = rest if space else 'n/a'
            genes[gene_id] = [[gene_id, fasta, description, previous], {}]
            g2index[gene_id] = index
            index += 1
            previous = gene_id
def save_refs(sam, fastas, genomes, s2b):
    """
    set up per-position statistics for every reference contig

    genomes[genome][contig][sam.name] = {'bp_stats': [one dict per base]}
    Every per-base dict has zeroed A/T/G/C/N counters, empty 'In' and
    'Del' lists and 'ref' (the upper-cased reference base, or 'N' when no
    reference sequence is available).

    sam: iterable of SAM lines that exposes `.name`.
    fastas: reference fasta paths; a False entry means no reference was
    supplied, in which case contig lengths are read from the @SQ header
    lines of sam (up to the first @PG line) and the function returns
    immediately.
    s2b: contig -> genome lookup, or False to start an empty one; contigs
    missing from it are assigned to genome 'n/a'.

    Returns (genomes, s2b); both are updated in place.
    """
    def empty_stats(reference_base):
        # a fresh dict (with fresh lists) for every position
        return {'A':0, 'T':0, 'G':0, 'C':0, 'N':0,
                'In':[], 'Del':[], 'ref':reference_base}

    if s2b is False:
        s2b = {}
    for fasta in fastas:
        # if no reference sequence supplied, get length from sam file
        if fasta is False:
            for line in sam:
                if line.startswith('@PG'):
                    break
                if not line.startswith('@SQ'):
                    continue
                fields = line.strip().split()
                contig = fields[1].split(':', 1)[1]
                length = int(fields[2].split(':', 1)[1])
                if contig in s2b:
                    genome = s2b[contig]
                else:
                    genome = 'n/a'
                    s2b[contig] = genome
                if genome not in genomes:
                    genomes[genome] = {}
                if contig not in genomes[genome]:
                    genomes[genome][contig] = {}
                genomes[genome][contig][sam.name] = {
                    'bp_stats': [empty_stats('N') for _ in range(0, length)]
                }
            return genomes, s2b
        # save reference sequences, if available
        genome = fasta.rsplit('.', 1)[0]
        if genome not in genomes:
            genomes[genome] = {}
        for seq in parse_fasta(fasta):
            contig = seq[0].split('>')[1].split()[0]
            s2b[contig] = genome
            if contig not in genomes[genome]:
                genomes[genome][contig] = {}
            genomes[genome][contig][sam.name] = {
                'bp_stats': [empty_stats(base.upper()) for base in seq[1]]
            }
    return genomes, s2b
def split_fasta(f, id2f):
    """
    split fasta file into separate fasta files based on list of scaffolds
    that belong to each separate file

    f: fasta passed to parse_fasta.
    id2f: sequence ID -> output name; sequences whose ID is not listed are
    skipped. Each selected record is appended to '<output name>.fa'
    (opened in 'a+' mode, so existing content is kept).
    """
    targets = {}  # output name -> path of its fasta file
    for record in parse_fasta(f):
        seq_id = record[0].split('>')[1].split()[0]
        if seq_id not in id2f:
            continue
        group = id2f[seq_id]
        path = targets.setdefault(group, '%s.fa' % group)
        # terminate the sequence line so appended records do not run
        # together
        record[1] += '\n'
        with open(path, 'a+') as f_out:
            f_out.write('\n'.join(record))
def get_descriptions(fastas):
    """
    get the description for each ORF

    Returns id2desc[id] = [fasta, description, length] where id is the
    header text up to the first space, description is the remainder of
    the header ('n/a' when there is none) and length is the number of
    characters in the stripped sequence other than '*', as a float.
    """
    id2desc = {}
    for fasta in fastas:
        for record in parse_fasta(fasta):
            orf_id, space, remainder = \
                record[0].split('>')[1].partition(' ')
            desc = remainder if space else 'n/a'
            residues = record[1].strip()
            length = float(len(residues) - residues.count('*'))
            id2desc[orf_id] = [fasta, desc, length]
    return id2desc
def re_assemble_velvet(pr, prefix, scaffold, error, scaffolding, min_contig):
    """
    re-assemble reads using velvet

    Runs velvet on the paired reads pr into '<prefix>/s_<scaffold>_e_<error>'
    (kmers 21 to 71 in steps of 10), then gathers every record of the
    '*.fasta' files found there into one fasta file.

    Returns (path of the combined fasta, re_assembled_seqs) where
    re_assembled_seqs[id] = [header, length, sequence]. Records with an
    empty ([]) header are skipped.
    """
    out = '%s/s_%s_e_%s' % (prefix, scaffold, error)
    velvet(paired=[pr], out=out, scaffolding=scaffolding,
           silent=True, min_contig=min_contig,
           kmer_min=21, kmer_max=71, kmer_increase=10)
    re_assembled_seqs = {}
    assembled_path = '%s/velvet_s-%s.e_%s.fa' % (out, scaffold, error)
    # the combined file ends in '.fa', so the '*.fasta' glob below never
    # picks it up; `with` closes it even if parsing fails
    with open(assembled_path, 'w') as assembled_fasta:
        for fasta in glob('%s/*.fasta' % (out)):
            for seq in parse_fasta(fasta):
                if seq[0] == []:
                    continue
                re_assembled_seqs[seq[0].split('>')[1]] = \
                    [seq[0], len(seq[1]), seq[1]]
                print('\n'.join(seq), file=assembled_fasta)
    return assembled_path, re_assembled_seqs
def genome_lengths(fastas, info):
    """
    get genome lengths

    fastas: iterable of fasta paths (strings).
    info: existing results to extend, or False to start an empty dict.

    The genome name is the path without its last '.suffix', without its
    directory, and cut at the first '.contigs'. Names already present in
    info are not recomputed.

    Returns info[name] = {'genome size (bp)': total bases,
                          '# contigs': number of records}.
    """
    if info is False:
        info = {}
    for genome in fastas:
        stem = genome.rsplit('.', 1)[0]
        name = stem.rsplit('/', 1)[-1].rsplit('.contigs')[0]
        if name in info:
            continue
        contig_lengths = [len(record[1]) for record in parse_fasta(genome)]
        info[name] = {
            'genome size (bp)': sum(contig_lengths),
            '# contigs': len(contig_lengths),
        }
    return info
def parse_fasta_annotations(fastas, annot_tables, trans_table):
    """
    parse gene call information from Prodigal fasta output

    fastas: iterable of fastas passed to parse_fasta.
    annot_tables: False, or an iterable of paths to whitespace-separated
    tables with four columns: ID, start, end, strand.
    trans_table: value stored under 'transl_table' for every gene.

    Yields (contig, [locus, [start, end, strand], info]) where info has
    'transl_table', 'translation' and 'product' entries. For headers with
    Prodigal fields start/end are ints and product is a one-item list;
    for table-based annotations start/end stay as read from the table
    (strings) and product is the header text after the first space.

    Exits (SystemExit, after a message on stderr) when a header has no
    Prodigal fields and no annotation table was given.
    """
    if annot_tables is not False:
        annots = {}
        for table in annot_tables:
            # `with` makes sure every table handle is closed
            with open(table) as table_handle:
                for cds in table_handle:
                    ID, start, end, strand = cds.strip().split()
                    annots[ID] = [start, end, int(strand)]
    for fasta in fastas:
        for seq in parse_fasta(fasta):
            from_prodigal = '# ;gc_cont' in seq[0] or '# ID=' in seq[0]
            if not from_prodigal and annot_tables is False:
                print(
                    '# specify fasta from Prodigal or annotations table (-t)',
                    file=sys.stderr)
                # sys.exit does not depend on the site-provided exit()
                sys.exit()
            locus = seq[0].split()[0].split('>')[1]
            if 'ID=' in seq[0]:
                # contig = locus with the trailing '_<ID>' removed
                ID = seq[0].rsplit('ID=', 1)[1].split(';', 1)[0]
                contig = locus.rsplit('_%s' % (ID), 1)[0]
            else:
                contig = locus.rsplit('_', 1)[0]
            if from_prodigal:
                # annotation info from Prodigal
                info = seq[0].split(' # ')
                start, end = int(info[1]), int(info[2])
                strand = 1 if info[3] == '1' else -1
                product = [''.join(info[4].split()[1:])]
            else:
                # annotation info from table
                start, end, strand = annots[locus]
                product = seq[0].split(' ', 1)[1]
            info = {'transl_table': [trans_table],
                    'translation': [seq[1]],
                    'product': product}
            yield contig, [locus, [start, end, strand], info]
def re_assemble_velvet(pr, prefix, scaffold, error, re_assembled_fasta, scaffolding, min_contig):
    """
    re-assemble reads using velvet

    Runs velvet on the paired reads pr (kmers 21 to 71 in steps of 10)
    into '<prefix>/s_<scaffold>_e_<error>', truncated to 100 characters.
    Every record of the resulting '*.fasta' files is renamed to
    '><scaffold>_e:<error>_<original id>' and printed to the open handle
    re_assembled_fasta. Records with an empty ([]) header are skipped.

    Returns re_assembled_seqs[new id] = [new header, length, sequence].
    """
    # make sure file name is not too long
    out = ('%s/s_%s_e_%s' % (prefix, scaffold, error))[0:100]
    velvet(paired=[pr], out=out, scaffolding=scaffolding,
           silent=True, min_contig=min_contig,
           kmer_min=21, kmer_max=71, kmer_increase=10)
    re_assembled_seqs = {}
    for path in glob('%s/*.fasta' % (out)):
        for record in parse_fasta(path):
            if record[0] == []:
                continue
            original_id = record[0].split('>')[1]
            record[0] = '>%s_e:%s_%s' % (scaffold, error, original_id)
            new_id = record[0].split('>')[1]
            re_assembled_seqs[new_id] = [record[0], len(record[1]), record[1]]
            print('\n'.join(record), file=re_assembled_fasta)
    return re_assembled_seqs
def six_frame(genome, table, minimum=10):
    """
    translate each sequence into six reading frames

    Each record is upper-cased, 'U' is replaced by 'T', and the forward
    ('f') and reverse-complement ('rc') strands are translated in frames
    1-3 with the given table. Translations are split at '*' and pieces
    shorter than minimum are dropped.

    Yields [header, protein]; the header is
    '<id>_<n> table=<table> frame=<frame>-<direction> <rest of header>'
    with n counting the kept pieces of that record.
    """
    for record in parse_fasta(genome):
        dna = Seq(record[1].upper().replace('U', 'T'), IUPAC.ambiguous_dna)
        header_fields = record[0].split()
        name, description = header_fields[0], ' '.join(header_fields[1:])
        kept = 0
        strands = (('f', dna), ('rc', dna.reverse_complement()))
        for direction, strand_seq in strands:
            for frame in range(0, 3):
                translated = strand_seq[frame:].translate(
                    table=table, to_stop=False)
                for prot in translated.split('*'):
                    if len(prot) < minimum:
                        continue
                    kept += 1
                    header = '%s_%s table=%s frame=%s-%s %s' % (
                        name, kept, table, frame + 1, direction, description)
                    yield [header, prot]
def de_rep(fastas, append_index, return_original = False):
    """
    de-replicate fastas based on sequence names

    The first record seen for each ID is yielded unchanged. A repeated ID
    is dropped unless append_index == True, in which case it is renamed
    with the ID returned by append_index_id(id, ids) and yielded as a new
    [header, sequence] record.

    return_original: when exactly True each item is
    [header tokens, record] instead of just the record.
    """
    ids = []
    for fasta in fastas:
        for seq in parse_fasta(fasta):
            header = seq[0].split('>')[1].split()
            seq_id = header[0]
            if seq_id in ids:
                if not (append_index == True):
                    continue
                # append_index_id also hands back the updated ID list
                new, ids = append_index_id(seq_id, ids)
                record = ['>%s %s' % (new, ' '.join(header[1::])), seq[1]]
            else:
                ids.append(seq_id)
                record = seq
            if return_original is True:
                yield [header, record]
            else:
                yield record
def check_assembly(assembly, pr, threads, cov_thresh, mismatches, collection_mismatches, multiple, prefix, window = False, combine_windows = False, pr_split = False, allow_orphan = False, allow_orphan_ends = False):
    """
    check assembly for mismatches

    Pipeline: load the assembly, map reads, find error positions from the
    parsed mapping, then collect reads either for whole contigs (window is
    exactly False) or for windows around the errors.

    Returns (scaffolds, mapping, s2c, s2errors, reads, pr_split) where
    scaffolds[id] = [header, length, sequence] and pr_split is the value
    handed back by map_reads.
    """
    # read assembly into memory, skipping the empty [[], []] record
    scaffolds = {}
    for record in parse_fasta(assembly):
        if record == [[], []]:
            continue
        scaffolds[record[0].split('>')[1]] = \
            [record[0], len(record[1]), record[1]]

    # map reads to assembly
    mapping, pr_split = map_reads(
        assembly, scaffolds, pr, threads, multiple, pr_split=pr_split)

    # identify errors (no coverage regions) based on stringent mapping
    # * s2errors = [positions with errors]
    # * s2c[scaffold] = [per scaffold coverage at each position]
    pairs, header = parse_mapping(mapping)
    s2c, s2errors = id_errors(
        pairs, header, assembly, scaffolds, cov_thresh, mismatches,
        allow_orphan=allow_orphan, allow_orphan_ends=allow_orphan_ends)

    # collect reads that map to either window or entire contigs
    # * reads[scaffold][error] = pe
    collect_args = [pairs, assembly, scaffolds, collection_mismatches, prefix]
    if window is not False:
        collect_args += [s2errors, window, combine_windows]
    reads = collect_reads(*collect_args)
    return scaffolds, mapping, s2c, s2errors, reads, pr_split
def parse_genomes_fa(fastas, mappings):
    """
    build empty per-sample coverage tables for every genome

    genomes[genome name] = {order: [contig order], samples: {}, len: bp}
    samples[sample name] = {cov: [], contigs: {}}
    contigs[contig name] = [0 for every position]

    fastas: iterable of fasta handles exposing `.name`.
    mappings: iterable whose items carry the sample name at index 0.

    Returns (genomes, id2g) where id2g maps contig ID -> genome name.
    """
    id2g = {}  # contig ID to genome lookup
    genomes = {}  # dictionary for saving genome info
    for genome in fastas:
        name = genome.name
        samples = {}
        for mapping in mappings:
            samples[mapping[0]] = {'contigs': {}, 'cov': []}
        entry = {'order': [], 'samples': samples, 'len': 0}
        genomes[name] = entry
        for record in parse_fasta(genome):
            contig = record[0].split('>', 1)[1].split()[0]
            entry['order'].append(contig)
            id2g[contig] = name
            length = len(record[1])
            entry['len'] += length
            for sample in list(samples.keys()):
                # a separate zero-filled list per sample and contig
                entry['samples'][sample]['contigs'][contig] = [0] * length
    return genomes, id2g