def parse_readsRC(reads_file):
    """
    parse reads file and yield each sequence followed by
    its reverse complement

    reads_file: iterable, seekable handle of fasta or fastq reads; the
    format is decided from the first line ('>' means fasta, anything
    else is treated as fastq)

    yields: the read sequence (upper-cased for fasta input only) and
    then its reverse complement; nothing is yielded for empty input

    raises: KeyError if a base is not one of A, T, G, C, N or '-'
    """
    COMP = dict(
        zip(['A', 'T', 'G', 'C', 'N', '-'], ['T', 'A', 'C', 'G', 'N', '-']))
    # peek at the first line to determine the file type
    first_line = next(iter(reads_file), None)
    if first_line is None:
        # empty input: there is no format to detect and nothing to yield
        return
    is_fasta = first_line.startswith('>')
    # rewind so that the parser sees the line that was just consumed
    reads_file.seek(0)
    if is_fasta:
        records = parse_fasta(reads_file)
    else:
        records = parse_fastq(reads_file)
    for record in records:
        seq = record[1]
        if is_fasta:
            # only fasta sequences are upper-cased (fastq is used as-is)
            seq = seq.upper()
        # return sequence and reverse complement
        yield seq
        yield ''.join([COMP[base] for base in seq[::-1]])
Exemple #2
0
def parse_genomes(fastas, single):
    """
    generator for parsing fastas

    if single is True, every sequence in a (multi)fasta file is combined
    and one (name, length, bases) tuple is yielded per file, where name
    is the file name without its last extension; otherwise one
    (ID, length, bases) tuple is yielded per sequence

    bases is always a list of upper-cased characters
    """
    for genome in fastas:
        if single is True:
            # concatenate all records of this file into one list of bases
            bases = []
            for record in parse_fasta(genome):
                bases += record[1].upper()
            name = genome.name.rsplit('.', 1)[0]
            yield (name, len(bases), bases)
        else:
            for record in parse_fasta(genome):
                # ID is the first word of the header, without the '>'
                ID = record[0].split('>', 1)[1].split()[0]
                yield (ID, len(record[1]), list(record[1].upper()))
Exemple #3
0
def scaffold_hits(searches, fasta, max_hits):
    """
    get hits from each search against each RP

    searches: re-iterable of (rp, blast output path) pairs
    fasta: ORF fasta; the scaffold name is the ORF ID with its last
    '_'-separated field removed
    max_hits: passed through to numblast

    returns:
    scaffolds[scaffold] = # ORFs
    s2rp[scaffold] = {rp:[hits]}

    raises KeyError if a hit refers to a scaffold that is not in fasta
    """
    # initialize
    ## scaffolds[scaffold] = # ORFs
    scaffolds = {}
    for seq in parse_fasta(fasta):
        scaffold = seq[0].split()[0].split('>', 1)[1].rsplit('_', 1)[0]
        scaffolds[scaffold] = scaffolds.get(scaffold, 0) + 1
    s2rp = {s: {r[0]: [] for r in searches} for s in scaffolds}
    # get hits from blast
    for rp, blast in searches:
        # read all hits while the file is open so the handle is always closed
        # (evalue_thresh and bit_thresh are not defined in this function;
        # they are looked up in the enclosing module)
        with open(blast) as blast_handle:
            hits = list(
                numblast(blast_handle, max_hits, evalue_thresh, bit_thresh))
        for hit in hits:
            s = hit[0].split()[0].rsplit('_', 1)[0]
            # columns 10 and 11 are converted to float in place
            hit[10], hit[11] = float(hit[10]), float(hit[11])
            s2rp[s][rp].append(hit)
    return scaffolds, s2rp
Exemple #4
0
def strip_inserts(fasta):
    """
    remove insertion columns from aligned fasta file

    only gap characters ('-') and upper-case characters are kept in each
    sequence; the record is modified in place and yielded
    """
    for record in parse_fasta(fasta):
        kept = (base for base in record[1] if base.isupper() or base == '-')
        record[1] = ''.join(kept)
        yield record
def check_assembly(assembly, pr, threads, cov_thresh,
                   mismatches, collection_mismatches, multiple,
                   prefix, window, combine_windows,
                   pr_split, allow_orphan, allow_orphan_ends, save_mapping):
    """
    identify assembly errors

    the assembly is read into memory, reads are mapped to it, errors are
    identified from the stringent mapping, windows are defined around
    them and the reads mapping to those windows are collected

    returns: scaffolds, mapping, s2c, s2errors, reads, pr_split
    """
    # read assembly into memory: scaffolds[id] = [header, length, sequence]
    scaffolds = {}
    for record in parse_fasta(assembly):
        if record == [[], []]:
            continue
        scaffolds[record[0].split('>')[1]] = \
            [record[0], len(record[1]), record[1]]
    # map reads to assembly
    mapping, pr_split = map_reads(
        assembly, scaffolds, pr, threads, multiple, pr_split=pr_split)
    # identify errors as zero coverage regions based on stringent mapping
    stringent_pairs, header = parse_mapping_stringent(
        mapping, scaffolds, mismatches)
    # * s2c[scaffold] = [per scaffold coverage at each position]
    # * s2errors = [positions with errors]
    s2c, s2errors = id_errors(
        stringent_pairs, header, assembly, scaffolds, cov_thresh, mismatches,
        allow_orphan=allow_orphan,
        allow_orphan_ends=allow_orphan_ends,
        save_mapping=save_mapping)
    # define windows
    s2windows = define_windows(scaffolds, s2errors, window, combine_windows)
    error_pairs = parse_mapping_errors(mapping, s2errors, s2windows)
    # collect reads that map to window
    # * reads[scaffold][error] = pe
    reads = collect_reads(
        error_pairs, assembly, scaffolds, collection_mismatches, prefix,
        s2errors, s2windows)
    return scaffolds, mapping, s2c, s2errors, reads, pr_split
def parse_catalytic(insertion, gff):
    """
    parse catalytic RNAs to gff format

    insertion['intron'] is expected to be a '|'-delimited string of fasta
    lines whose headers read '>ID annotation strand start-end'; if it is
    not a str, gff is returned untouched

    one row is appended to the gff columns per intron and gff is returned
    """
    offset = insertion['offset']
    gene_strand = insertion['strand']
    introns = insertion['intron']
    if type(introns) is not str:
        return gff
    for record in parse_fasta(introns.split('|')):
        name, annot, strand, span = record[0].split('>')[1].split()
        start, end = (int(coord) for coord in span.split('-'))
        if strand != gene_strand:
            # flip the strand and swap (and shift) the coordinates
            strand = '-' if strand == '+' else '+'
            start, end = end - 2, start - 2
        start = abs(start + offset) - 1
        end = abs(end + offset) - 1
        row = (
            ('#seqname', insertion['ID']),
            ('source', 'Rfam'),
            ('feature', 'Catalytic RNA'),
            ('start', start),
            ('end', end),
            ('score', '.'),
            ('strand', strand),
            ('frame', '.'),
            ('attribute', 'ID=%s; Name=%s' % (name, annot)),
        )
        for column, value in row:
            gff[column].append(value)
    return gff
def parse_insertion(insertion, gff):
    """
    parse insertion to gff format

    insertion['insertion sequence'] is a '|'-delimited string of fasta
    lines; each header must carry a 'gene-pos=start-end' field

    one 'IVS' row is appended to the gff columns per insertion sequence
    and gff is returned
    """
    offset = insertion['offset']
    for record in parse_fasta(insertion['insertion sequence'].split('|')):
        strand = insertion['strand']
        header = record[0]
        name = header.split('>')[1].split()[0]
        span = header.split('gene-pos=', 1)[1].split()[0]
        start, end = [int(coord) for coord in span.split('-')]
        start = abs(start + offset)
        end = abs(end + offset)
        if strand == '-':
            # report coordinates in reverse order on the minus strand
            start, end = end, start
        gff['#seqname'].append(insertion['ID'])
        gff['source'].append(insertion['source'])
        gff['feature'].append('IVS')
        gff['start'].append(start)
        gff['end'].append(end)
        gff['score'].append('.')
        gff['strand'].append(strand)  # same as rRNA
        gff['frame'].append('.')
        gff['attribute'].append('ID=%s' % (name))
    return gff
Exemple #8
0
def parse_genomes(fastas, single):
    """
    generator for parsing fastas

    single is True: yield one (file name without last extension,
    length, bases) tuple per file, combining every sequence in the file
    otherwise: yield one (sequence ID, length, bases) tuple per sequence

    bases is a list of upper-cased characters
    """
    combine = single is True
    for genome in fastas:
        records = parse_fasta(genome)
        if combine:
            merged = [
                base for record in records for base in record[1].upper()
            ]
            yield (genome.name.rsplit('.', 1)[0], len(merged), merged)
            continue
        for record in records:
            # first word of the header, minus the leading '>'
            ID = record[0].split('>', 1)[1].split()[0]
            yield (ID, len(record[1]), list(record[1].upper()))
def genome_coverage(genomes, scaffold_coverage, total_bases):
    """
    calculate coverage statistics for each genome

    coverage, std and custom values are accumulated per scaffold by
    sum_coverage and calc_custom; each coverage entry is finally
    reduced to entry[0] / entry[1]

    if total_bases is True it is replaced by the value calculated
    from the coverage table

    returns: coverage, std, absolute, relative, custom_av, custom_std
    """
    coverage, std, custom = {}, {}, {}
    for genome in genomes:
        for record in parse_fasta(genome):
            scaffold = record[0].split('>')[1].split()[0]
            coverage, std = sum_coverage(
                coverage, std, genome, scaffold, record, scaffold_coverage)
            custom = calc_custom(
                custom, genome, scaffold, record, scaffold_coverage,
                total_bases)
    std = calc_std(std)
    custom_std = calc_std(custom)
    # mean of every sample, per genome
    custom_av = {
        genome: [numpy.mean(sample) for sample in samples]
        for genome, samples in custom.items()
    }
    for genome in coverage:
        print('%s\t%s' % (genome, coverage[genome][0][1]))
    if total_bases is True:
        total_bases = calc_total_mapped_bases(coverage)
    absolute = absolute_abundance(coverage, total_bases)
    for genome in coverage:
        coverage[genome] = [calc[0] / calc[1] for calc in coverage[genome]]
    relative = relative_abundance(coverage)
    return coverage, std, absolute, relative, custom_av, custom_std
def analyze_fa(fa):
    """
    analyze fa (names, insertions) and convert fasta to prodigal/cmscan safe file
    - find insertions (masked sequence)
    - make upper case
    - assign names to id number

    fa: open fasta handle (fa.name is used to name the output files)

    writes '<fa.name>.id' ('temp.id' when reading from stdin) with
    zero-padded numeric IDs as headers, and '<fa.name>.id.lookup' with
    tab-separated id/name pairs in both directions

    returns: safe file name, sequences[id] = fasta record,
    id2name (id -> name and name -> id), names (in file order),
    insertions[id] = result of insertions_from_masked
    """
    if fa.name == '<stdin>':
        safe_name = 'temp.id'
    else:
        safe_name = '%s.id' % (fa.name)
    sequences = {}  # sequences[id] = sequence
    insertions = {}  # insertions[id] = [[start, stop], [start, stop], ...]
    id2name = {}
    names = []
    # context managers make sure the files are closed even if parsing fails
    with open(safe_name, 'w') as safe:
        for count, seq in enumerate(parse_fasta(fa)):
            seq_id = '%010d' % (count, )
            name = seq[0].split('>', 1)[1]
            id2name[seq_id] = name
            id2name[name] = seq_id
            names.append(name)
            insertions[seq_id] = insertions_from_masked(seq[1])
            sequences[seq_id] = seq
            print('\n'.join(['>%s' % (seq_id), seq[1].upper()]), file=safe)
    # NOTE(review): for stdin the lookup file is named '<stdin>.id.lookup'
    # while the safe file is 'temp.id' - confirm whether that is intended
    with open('%s.id.lookup' % (fa.name), 'w') as lookup:
        for pair in id2name.items():
            print('\t'.join(pair), file=lookup)
    return safe_name, sequences, id2name, names, insertions
def concat_align(fastas):
    """
    concatenate alignments

    fastas: re-iterable of hashable fasta sources accepted by parse_fasta

    returns concat[ID] = [aligned sequence from each fasta, in order];
    a sequence missing from a fasta is replaced by a run of '-' whose
    length is that of the last sequence read from that fasta
    (zero if the fasta had no sequences)
    """
    # read in sequences
    fa2len = {}
    seqs = {}
    IDs = {}  # used as an ordered set: IDs in order of first appearance
    for fasta in fastas:
        seqs[fasta] = {}
        # default so that a fasta without sequences does not reuse the
        # length of the previous file (or fail on an unbound variable)
        fa2len[fasta] = 0
        for seq in parse_fasta(fasta):
            ID = seq[0].split('>')[1].split()[0]
            IDs[ID] = None
            seqs[fasta][ID] = seq[1]
            fa2len[fasta] = len(seq[1])
    # concat sequences
    concat = {}
    for fasta in fastas:
        for ID in IDs:
            if ID not in concat:
                concat[ID] = []
            if ID not in seqs[fasta]:
                concat[ID].append('-' * fa2len[fasta])
            else:
                concat[ID].append(seqs[fasta][ID])
    return concat
def parse_orf(insertion, gff):
    """
    parse ORF to gff format

    insertion['orf'] is expected to be a '|'-delimited string of fasta
    lines whose headers contain ' # start # end # strand' fields; if it
    is not a str, gff is returned untouched

    one 'CDS' row is appended to the gff columns per ORF and gff is
    returned
    """
    offset = insertion['offset']
    orfs = insertion['orf']
    if type(orfs) is not str:
        return gff
    for record in parse_fasta(orfs.split('|')):
        header = record[0]
        name = header.split('>')[1].split()[0]
        start, end, direction = [
            int(field) for field in header.split(' # ')[1:4]
        ]
        strand = '+' if direction == 1 else '-'
        gene_strand = insertion['strand']
        if strand != gene_strand:
            # flip the strand and swap (and shift) the coordinates
            strand = '-' if strand == '+' else '+'
            start, end = end - 2, start - 2
        start = abs(start + offset) - 1
        end = abs(end + offset) - 1
        # the annotation is the second whitespace-delimited header field
        annot = header.split()[1]
        if annot == 'n/a':
            annot = 'unknown'
        gff['#seqname'].append(insertion['ID'])
        gff['source'].append('Prodigal and Pfam')
        gff['feature'].append('CDS')
        gff['start'].append(start)
        gff['end'].append(end)
        gff['score'].append('.')
        gff['strand'].append(strand)
        gff['frame'].append('.')
        gff['attribute'].append('ID=%s; Name=%s' % (name, annot))
    return gff
Exemple #13
0
def fix_fasta(fasta):
    """
    remove pesky characters from fasta file header

    headers are cleaned with remove_char; records with an empty
    sequence are dropped
    """
    for record in parse_fasta(fasta):
        record[0] = remove_char(record[0])
        if len(record[1]) == 0:
            continue
        yield record
Exemple #14
0
def find_16S(fastas,
             hmms,
             bit_thresh=float(20),
             length_thresh=500,
             masking=True,
             buffer=0):
    """
    generator that extracts 16S sequences from fastas based on hmm results

    1) parse hmm output into dictionary (sequence must pass bit_thresh and inc == '!')
        seq2hmm[seq] = {model: [sstart, ssend, length, strand, score]}
    2) determine which model (archaea, bacteria, eukarya) the sequence most closely matches
        seq2hmm[seq] = [model, sstart, send, length, strand, score], [model2, sstart2, send2, length2, strand2, score2], ...]
    3) identify regions that match to 16S (for best model)
    4) mask internal regions that do not align to model
    5) length threshold applies to aligned regions of 16S sequence
    6) export 16S sequence based on complete gene (including masked insertions)

    (steps 1-3 are presumably carried out inside find_coordinates - confirm there)

    fastas: iterable of fasta sources accepted by parse_fasta
    hmms, bit_thresh: passed to find_coordinates
    length_thresh: minimum number of non-insertion bases for a hit to be reported
    masking: if True, gaps are masked with mask_sequence before extraction
    buffer: number of extra positions retrieved on each side of the hit
        (clipped to the ends of the sequence)

    yields: [header, sequence] for each hit that passes length_thresh;
    the sequence is passed through rc() when strand is '-'
    """
    # identify start/stop positions
    # group2hmm[seq][group] = [model, strand, coordinates, matches, gaps]
    group2hmm = find_coordinates(hmms, bit_thresh)
    # get sequences from fasta file
    for fasta in fastas:
        for seq in parse_fasta(fasta):
            id = seq[0].split('>')[1].split()[0]
            if id not in group2hmm:
                continue
            seq[1] = seq[1].upper()
            count = 0  # how many 16S genes are there on the contig?
            for group, info in list(group2hmm[id].items()):
                model, strand, coords, matches, gaps = info
                # count insertion bases (ib) from gaps
                ib = sum([i[1] - i[0] + 1 for i in gaps])
                # calculate length of non-insertion regions (don't include buffer)
                tl = coords[1] - coords[0] + 1
                length = tl - ib
                if length < length_thresh:
                    continue
                # count sequence
                count += 1
                # set retrieval coords based on buffer
                # (1-based, inclusive; clipped to the sequence boundaries)
                ret_coords = [max([coords[0] - buffer, 1]), \
                        min([coords[1] + buffer, len(seq[1])]), coords[2]]
                buffer_ends = check_buffer(coords, len(seq[1]), buffer)
                # mask insertion sequences
                # (seq[1] is overwritten, so masking accumulates across the
                # groups found on the same sequence)
                if masking is True:
                    seq[1] = mask_sequence(seq[1], gaps)
                S = seq[1][(ret_coords[0] - 1):(ret_coords[1])]
                # insertion length following each match; the last match is
                # labelled 'end'
                inserts = [gap[1] - gap[0] + 1 for gap in gaps]
                inserts.append('end')
                model_pos = ';'.join([
                    '%s-%s(%s)' % (match[2], match[3], insert)
                    for match, insert in zip(matches, inserts)
                ])
                header = '%s 16SfromHMM::model=%s seq=%s pos=%s-%s strand=%s total-len=%s 16S-len=%s model-pos(ins-len)=%s buffer-len=%s/%s ins-bases=%s' % \
                        (seq[0], model, count, ret_coords[0], ret_coords[1], strand, tl, length, model_pos, buffer_ends[0], buffer_ends[1], ib)
                # reverse complement if strand is reverse
                if strand == '-':
                    S = rc(['', S])[1]
                yield [header, S]
def fa2s2b(fastas):
    """
    convert fastas to s2b dictionary

    s2b[sequence ID] = name of the fasta file the sequence came from,
    without directory and without its last extension
    """
    s2b = {}
    for path in fastas:
        for record in parse_fasta(path):
            seq_id = record[0].split('>', 1)[1].split()[0]
            file_name = path.rsplit('/', 1)[-1]
            s2b[seq_id] = file_name.rsplit('.', 1)[0]
    return s2b
Exemple #16
0
def idba_ud_seqs(fasta_name, idba_dir):
    """
    get fasta file from idba_ud assembly directory, save to dictionary

    idba_dir/scaffold.fa is used if it exists, otherwise
    idba_dir/contig.fa

    a copy of that file with every space replaced by '_' is written to
    fasta_name

    returns re_assembled_seqs[id] = [header, length, sequence], with
    spaces in ids and headers replaced by '_'
    """
    out = '%s/scaffold.fa' % (idba_dir)
    if not os.path.exists(out):
        out = '%s/contig.fa' % (idba_dir)
    re_assembled_seqs = {}
    for seq in parse_fasta(out):
        # skip records whose header is not a string
        if type(seq[0]) is not str:
            continue
        seq_id = seq[0].split('>')[1].replace(' ', '_')
        re_assembled_seqs[seq_id] = \
            [seq[0].replace(' ', '_'), len(seq[1]), seq[1]]
    # byte-wise equivalent of `cat out | tr " " "_" > fasta_name`, done
    # without a shell so that unusual characters in the paths are safe
    with open(out, 'rb') as source, open(fasta_name, 'wb') as target:
        for line in source:
            target.write(line.replace(b' ', b'_'))
    return re_assembled_seqs
def strip_masked(fasta, min_len, print_masked):
    """
    remove masked regions from fasta file as long as
    they are longer than min_len

    yields [0, [header, sequence]] for each sequence with its masked
    regions removed and, if print_masked is True, [1, [header, region]]
    for each non-empty masked region (numbered from 1)
    """
    for record in parse_fasta(fasta):
        unmasked, masked = parse_masked(record, min_len)
        stripped_header = '%s removed_masked >=%s' % (record[0], min_len)
        yield [0, [stripped_header, ''.join(unmasked)]]
        if print_masked is not True:
            continue
        regions = [region for region in masked if region != []]
        for number, region in enumerate(regions, 1):
            region_header = '%s insertion:%s' % (record[0], number)
            yield [1, [region_header, ''.join(region)]]
Exemple #18
0
def shuffle_genome(genome, cat, fraction = float(100), plot = True, \
        alpha = 0.1, beta = 100000, \
        min_length = 1000, max_length = 200000):
    """
    randomly shuffle genome

    all sequences in genome are concatenated and cut into fragments
    whose lengths are drawn from a gamma distribution (alpha, beta) and
    lie strictly between min_length and max_length; the fragments are
    shuffled and, unless fraction is 100, trimmed to that percentage of
    the genome length

    yields one [header, sequence] record if cat is True, otherwise one
    record per fragment (plot is not used here)
    """
    header = '>randomized_%s' % (genome.name)
    residues = ''.join([record[1] for record in parse_fasta(genome)])
    genome_length = len(residues)
    # break genome into pieces
    pieces = []
    while True:
        size = int(random.gammavariate(alpha, beta))
        if size <= min_length or size >= max_length:
            # fragment length out of bounds: draw again
            continue
        pieces.append(residues[:size])
        residues = residues[size:]
        if not residues:
            break
    # shuffle pieces
    random.shuffle(pieces)
    # subset fragments
    if fraction == float(100):
        subset = pieces
    else:
        budget = int(genome_length * fraction / 100)
        subset = []
        used = 0
        for fragment in pieces:
            if used + len(fragment) > budget:
                # trim the fragment that crosses the limit, then stop
                subset.append(fragment[0:budget - used])
                break
            subset.append(fragment)
            used += len(fragment)
    # combine sequences, if requested
    if cat is True:
        yield [header, ''.join(subset)]
    else:
        for index, fragment in enumerate(subset):
            yield ['%s fragment:%s' % (header, index), fragment]
Exemple #19
0
def check_type(fasta):
    """
    guess whether fasta holds nucleotide ('nucl') or protein ('prot')
    sequences

    only the first sequence is inspected, and only its first residue
    that is not N, U, '.', '-' or a space: the result is 'nucl' if that
    residue is A, T, G or C and 'prot' otherwise; 'nucl' is also
    returned when there is no such residue or no sequence at all
    """
    nucleotides = ('A', 'T', 'G', 'C')
    ignored = ('N', 'U', '.', '-', ' ')
    for record in parse_fasta(fasta):
        for residue in record[1].upper():
            if residue in ignored:
                continue
            # the first informative residue decides
            if residue in nucleotides:
                return 'nucl'
            return 'prot'
        return 'nucl'
    return 'nucl'
Exemple #20
0
def scaffolder(contigs,
               pr,
               pr_split,
               prefix,
               threads,
               mismatches,
               collection_mismatches=2,
               overlap=0.90):
    """
    scaffold contigs based on overlap, paired read support, and paired read confirmation of scaffold
    1) map reads to contigs (best mapping for each read)
    2) re-assemble reads that map to ends of contigs in order to find extensions
    3) blast contigs and extensions against one another and find where ends overlap
    4) create network of contig connections (require both paired read connections and overlap)
    5) check all possible contig joins with paired read mapping
    6) break edges in network if join is not supported by paired read mapping
    7) traverse network to build scaffolds

    prefix: working directory for all intermediate files (created if missing)
    """
    # prepare scaffolder directory
    # (os.makedirs instead of a shell `mkdir -p`, so that prefix is never
    # interpreted by a shell)
    os.makedirs(prefix, exist_ok=True)
    contigs = curate.format_assembly(contigs, prefix)
    # save contigs to dictionary: contigs = [file_path, contigs[id] = [header, length, sequence]]
    contigs = [
        contigs,
        {
            i[0].split('>')[1]: [i[0], len(i[1]), i[1]]
            for i in parse_fasta(contigs) if i != [[], []]
        }
    ]
    # map reads to contigs
    mapping = curate.map_reads(\
            contigs[0], contigs[1], pr, threads, multiple = False, pr_split = pr_split)
    # find possible contig extensions
    extensions = assemble_extensions(contigs, mapping, mismatches,
                                     collection_mismatches, threads, prefix)
    combined = combine_fastas([contigs[1], extensions],
                              '%s/combined.fa' % (prefix))
    # find overlap between contigs and extensions
    blast_out = search(combined, combined, method = 'blast', \
            max_hits = 100, threads = threads, prefix = '%s/' % (prefix))
    # create network of contig connections
    graph, joins, id2scaffolds = network_contigs(contigs[1], extensions,
                                                 blast_out, overlap, prefix)
    # check joined contigs with paired read mapping
    graph = filter_graph_mapping(joins, graph, pr, threads, prefix, pr_split,
                                 mismatches)
    # find best path through graph
    graph = filter_graph_best(graph, prefix)
    # scaffold contigs based on graph
    # NOTE(review): the result is not returned - confirm whether callers
    # rely only on the files written under prefix
    scaffolds = scaffold_contigs(contigs[1], id2scaffolds, graph, prefix)
Exemple #21
0
def find_genes(fastas):
    """
    record every gene found in fastas

    for each sequence:
    genes[id] = [[id, fasta, description, previous id], {}]
    g2index[id] = running index across all fastas

    previous id is 0 for the first gene of each fasta and description
    is 'n/a' when the header has nothing after the id; genes and
    g2index are not defined in this function and must exist in the
    enclosing module
    """
    position = 0
    for fasta in fastas:
        last_seen = 0
        for record in parse_fasta(fasta):
            text = record[0].split('>')[1]
            gene_id, space, remainder = text.partition(' ')
            description = remainder if space else 'n/a'
            genes[gene_id] = [[gene_id, fasta, description, last_seen], {}]
            g2index[gene_id] = position
            position += 1
            last_seen = gene_id
def save_refs(sam, fastas, genomes, s2b):
    """
    prepare per-position statistics for every reference contig

    genomes = {} # genomes[genome][contig][sample] = {'bp_stats':[]}

    sam: open sam handle; sam.name is used as the sample name
    fastas: iterable of reference fasta paths; an entry that is False
    means no reference was supplied, in which case contig names and
    lengths are read from the @SQ header lines of sam (up to the
    first @PG line) and the reference base is set to 'N'
    genomes: dictionary to update (see above)
    s2b: contig -> genome lookup, or False to start a new one; contigs
    without an entry are assigned to genome 'n/a'

    returns: genomes, s2b (after ALL fastas / @SQ lines were processed)
    """
    if s2b is False:
        s2b = {}
    for fasta in fastas:
        # if no reference sequence supplied, get length from sam file
        if fasta is False:
            for line in sam:
                if line.startswith('@PG'):
                    break
                if not line.startswith('@SQ'):
                    continue
                fields = line.strip().split()
                contig = fields[1].split(':', 1)[1]
                length = int(fields[2].split(':', 1)[1])
                genome = s2b.setdefault(contig, 'n/a')
                genomes.setdefault(genome, {}).setdefault(contig, {})
                # one independent counter dictionary per position
                bp_stats = [{'A':0, 'T':0, 'G':0, 'C':0, 'N':0, \
                                'In':[], 'Del':[], 'ref':'N'}
                            for _ in range(length)]
                genomes[genome][contig][sam.name] = {'bp_stats': bp_stats}
            # there is no fasta to parse for this entry
            continue
        # save reference sequences, if available
        genome = fasta.rsplit('.', 1)[0]
        if genome not in genomes:
            genomes[genome] = {}
        for seq in parse_fasta(fasta):
            contig = seq[0].split('>')[1].split()[0]
            s2b[contig] = genome
            if contig not in genomes[genome]:
                genomes[genome][contig] = {}
            bp_stats = [{'A':0, 'T':0, 'G':0, 'C':0, 'N':0, \
                            'In':[], 'Del':[], 'ref':base.upper()}
                        for base in seq[1]]
            genomes[genome][contig][sam.name] = {'bp_stats': bp_stats}
    # return only once every reference has been processed
    return genomes, s2b
Exemple #23
0
def split_fasta(f, id2f):
    """
    split fasta file into separate fasta files based on list of scaffolds
    that belong to each separate file

    id2f[sequence ID] = output name (without '.fa'); sequences whose ID
    is not in id2f are skipped, all others are appended to
    '<output name>.fa'
    """
    out_paths = {}
    for record in parse_fasta(f):
        seq_id = record[0].split('>')[1].split()[0]
        if seq_id not in id2f:
            continue
        target = id2f[seq_id]
        if target not in out_paths:
            out_paths[target] = '%s.fa' % target
        # terminate the record so that the next one starts on a new line
        record[1] += '\n'
        with open(out_paths[target], 'a+') as handle:
            handle.write('\n'.join(record))
Exemple #24
0
def get_descriptions(fastas):
    """
    get the description for each ORF

    returns id2desc[ID] = [fasta, description, length], where the
    description is everything after the first space of the header
    ('n/a' if there is none) and length is the number of characters in
    the stripped sequence that are not '*', as a float
    """
    id2desc = {}
    for fasta in fastas:
        for record in parse_fasta(fasta):
            text = record[0].split('>')[1]
            orf_id, space, remainder = text.partition(' ')
            desc = remainder if space else 'n/a'
            residues = record[1].strip()
            length = float(len(residues) - residues.count('*'))
            id2desc[orf_id] = [fasta, desc, length]
    return id2desc
Exemple #25
0
def re_assemble_velvet(pr, prefix, scaffold, error, scaffolding, min_contig):
    """
    re-assemble reads using velvet

    velvet is run in '<prefix>/s_<scaffold>_e_<error>'; every sequence
    from the '*.fasta' files found there is collected and written to
    'velvet_s-<scaffold>.e_<error>.fa' in the same directory

    returns: path of the combined fasta,
    re_assembled_seqs[id] = [header, length, sequence]
    """
    out = '%s/s_%s_e_%s' % (prefix, scaffold, error)
    velvet(paired = [pr], out = out, scaffolding = scaffolding, \
        silent = True, min_contig = min_contig, kmer_min = 21, kmer_max = 71, kmer_increase = 10)
    re_assembled_seqs = {}
    assembled_name = '%s/velvet_s-%s.e_%s.fa' % (out, scaffold, error)
    # the context manager closes the file even if parsing fails
    with open(assembled_name, 'w') as assembled_fasta:
        for fasta in glob('%s/*.fasta' % (out)):
            for seq in parse_fasta(fasta):
                # skip records without a header
                if seq[0] == []:
                    continue
                re_assembled_seqs[seq[0].split('>')[1]] = \
                    [seq[0], len(seq[1]), seq[1]]
                print('\n'.join(seq), file=assembled_fasta)
    return assembled_name, re_assembled_seqs
def genome_lengths(fastas, info):
    """
    get genome lengths

    info[name] = {'genome size (bp)': total length, '# contigs': count}
    where name is the file name without directory, last extension and
    anything from '.contigs' onwards; genomes already in info are not
    parsed again, and info = False starts a new dictionary
    """
    if info is False:
        info = {}
    for genome in fastas:
        stem = genome.rsplit('.', 1)[0]
        name = stem.rsplit('/', 1)[-1].rsplit('.contigs')[0]
        if name in info:
            continue
        sizes = [len(record[1]) for record in parse_fasta(genome)]
        info[name] = {'genome size (bp)': sum(sizes), '# contigs': len(sizes)}
    return info
def parse_fasta_annotations(fastas, annot_tables, trans_table):
    """
    parse gene call information from Prodigal fasta output

    fastas: iterable of ORF fasta files
    annot_tables: False, or iterable of paths to whitespace-delimited
    tables with four columns: ID, start, end, strand
    trans_table: translation table stored with every ORF

    yields: contig, [locus, [start, end, strand], info] for each ORF,
    where info holds 'transl_table', 'translation' and 'product'

    exits (SystemExit) if a header carries no Prodigal annotation and
    no annotation table was supplied
    """
    if annot_tables is not False:
        annots = {}
        for table in annot_tables:
            # close each table as soon as it has been read
            with open(table) as table_handle:
                for cds in table_handle:
                    ID, start, end, strand = cds.strip().split()
                    annots[ID] = [start, end, int(strand)]
    for fasta in fastas:
        for seq in parse_fasta(fasta):
            from_prodigal = '# ;gc_cont' in seq[0] or '# ID=' in seq[0]
            if not from_prodigal and annot_tables is False:
                print(
                    '# specify fasta from Prodigal or annotations table (-t)',
                    file=sys.stderr)
                # sys.exit rather than the interactive-only exit() helper
                sys.exit()
            if 'ID=' in seq[0]:
                ID = seq[0].rsplit('ID=', 1)[1].split(';', 1)[0]
                contig = seq[0].split()[0].split('>')[1].rsplit(
                    '_%s' % (ID), 1)[0]
            else:
                contig = seq[0].split()[0].split('>')[1].rsplit('_', 1)[0]
            locus = seq[0].split()[0].split('>')[1]
            # annotation info from Prodigal
            if from_prodigal:
                info = seq[0].split(' # ')
                start, end, strand = int(info[1]), int(info[2]), info[3]
                if strand == '1':
                    strand = 1
                else:
                    strand = -1
                product = [''.join(info[4].split()[1:])]
            # annotation info from table
            # NOTE(review): here start/end stay strings and product is a
            # str, while the Prodigal branch gives ints and a list -
            # confirm what downstream code expects
            else:
                start, end, strand = annots[locus]
                product = seq[0].split(' ', 1)[1]
            info = {'transl_table':[trans_table], \
                    'translation':[seq[1]], \
                    'product':product}
            yield contig, [locus, [start, end, strand], info]
def re_assemble_velvet(pr, prefix, scaffold, error, re_assembled_fasta,
                       scaffolding, min_contig):
    """
    re-assemble reads using velvet

    velvet is run in '<prefix>/s_<scaffold>_e_<error>' (truncated to
    100 characters); every sequence from the '*.fasta' files found
    there is renamed to '<scaffold>_e:<error>_<original id>' and
    printed to the open handle re_assembled_fasta

    returns re_assembled_seqs[id] = [header, length, sequence]
    """
    # make sure file name is not too long
    out = ('%s/s_%s_e_%s' % (prefix, scaffold, error))[0:100]
    velvet(paired=[pr],
           out=out,
           scaffolding=scaffolding,
           silent=True,
           min_contig=min_contig,
           kmer_min=21,
           kmer_max=71,
           kmer_increase=10)
    re_assembled_seqs = {}
    for fasta in glob('%s/*.fasta' % (out)):
        for record in parse_fasta(fasta):
            # skip records without a header
            if record[0] == []:
                continue
            original_id = record[0].split('>')[1]
            record[0] = '>%s_e:%s_%s' % (scaffold, error, original_id)
            new_id = record[0].split('>')[1]
            re_assembled_seqs[new_id] = [record[0], len(record[1]), record[1]]
            print('\n'.join(record), file=re_assembled_fasta)
    return re_assembled_seqs
Exemple #29
0
def six_frame(genome, table, minimum=10):
    """
    translate each sequence into six reading frames

    every sequence is upper-cased, U is replaced by T, and the three
    frames of the forward ('f') and reverse-complement ('rc') strands
    are translated with the given table; translations are split at
    stops ('*') and pieces shorter than minimum are dropped

    yields [header, protein], numbering the proteins per sequence
    """
    for record in parse_fasta(genome):
        dna = Seq(record[1].upper().replace('U', 'T'), IUPAC.ambiguous_dna)
        fields = record[0].split()
        strands = (('f', dna), ('rc', dna.reverse_complement()))
        counter = 0
        for direction, strand_seq in strands:
            for frame in range(0, 3):
                translated = strand_seq[frame:].translate(
                    table=table, to_stop=False)
                for prot in translated.split('*'):
                    if len(prot) < minimum:
                        continue
                    counter += 1
                    header = '%s_%s table=%s frame=%s-%s %s' % (
                        fields[0], counter, table, frame + 1, direction,
                        ' '.join(fields[1:]))
                    yield [header, prot]
Exemple #30
0
def de_rep(fastas, append_index, return_original = False):
    """
    de-replicate fastas based on sequence names

    the first sequence with a given ID is yielded unchanged; later
    sequences with the same ID are dropped, unless append_index is
    true, in which case they are yielded under the new ID returned by
    append_index_id

    if return_original is True, [header fields, record] is yielded
    instead of the record alone
    """
    ids = []
    for fasta in fastas:
        for record in parse_fasta(fasta):
            header = record[0].split('>')[1].split()
            seq_id = header[0]
            if seq_id not in ids:
                ids.append(seq_id)
                result = record
            elif append_index == True:
                new_id, ids = append_index_id(seq_id, ids)
                result = [
                    '>%s %s' % (new_id, ' '.join(header[1::])), record[1]
                ]
            else:
                # duplicate ID and no re-naming requested: drop it
                continue
            if return_original is True:
                yield [header, result]
            else:
                yield result
Exemple #31
0
def check_assembly(assembly, pr, threads, cov_thresh, mismatches,
                   collection_mismatches, multiple, prefix,
                   window = False, combine_windows = False,
                   pr_split = False, allow_orphan = False,
                   allow_orphan_ends = False):
    """
    check assembly for mismatches

    the assembly is read into memory, reads are mapped to it, errors
    are identified from the mapping and the reads that map to either
    the error windows (if window is given) or to entire contigs are
    collected

    returns: scaffolds, mapping, s2c, s2errors, reads, pr_split
    """
    # read assembly into memory: scaffolds[id] = [header, length, sequence]
    scaffolds = {}
    for record in parse_fasta(assembly):
        if record == [[], []]:
            continue
        scaffolds[record[0].split('>')[1]] = \
            [record[0], len(record[1]), record[1]]
    # map reads to assembly
    mapping, pr_split = map_reads(
        assembly, scaffolds, pr, threads, multiple, pr_split = pr_split)
    # identify errors (no coverage regions) based on stringent mapping
    # * s2errors = [positions with errors]
    # * s2c[scaffold] = [per scaffold coverage at each position]
    pairs, header = parse_mapping(mapping)
    s2c, s2errors = id_errors(
        pairs, header, assembly, scaffolds, cov_thresh, mismatches,
        allow_orphan = allow_orphan, allow_orphan_ends = allow_orphan_ends)
    # collect reads that map to either window or entire contigs
    # * reads[scaffold][error] = pe
    if window is not False:
        reads = collect_reads(
            pairs, assembly, scaffolds, collection_mismatches, prefix,
            s2errors, window, combine_windows)
    else:
        reads = collect_reads(
            pairs, assembly, scaffolds, collection_mismatches, prefix)
    return scaffolds, mapping, s2c, s2errors, reads, pr_split
def parse_genomes_fa(fastas, mappings):
    """
    set up the coverage tables for every genome

    genomes[genome name] = {order: [contig order], samples: {}, len: total length}
        samples[sample name] = {cov: [coverage by position], contigs: {}}
            contigs[contig name] = [coverage by position]

    sample names are the first element of each entry in mappings and
    every per-contig coverage list starts out as zeros

    returns: genomes, id2g (contig ID -> genome name)
    """
    id2g = {}  # contig ID to genome lookup
    genomes = {}  # dictionary for saving genome info
    for genome in fastas:
        name = genome.name
        samples = {}
        for mapping in mappings:
            samples[mapping[0]] = {'contigs': {}, 'cov': []}
        entry = {'order': [], 'samples': samples, 'len': 0}
        genomes[name] = entry
        for record in parse_fasta(genome):
            ID = record[0].split('>', 1)[1].split()[0]
            entry['order'].append(ID)
            id2g[ID] = name
            size = len(record[1])
            entry['len'] += size
            for sample in list(samples.keys()):
                # a separate list of zeros for every sample
                entry['samples'][sample]['contigs'][ID] = [0] * size
    return genomes, id2g