Example #1
import screed


def test_trim_below_abundance(AnyTabletype):
    # AnyTabletype is a pytest fixture that supplies the k-mer table class
    # under test in the originating test suite.
    hi = AnyTabletype(6)

    x = "ATGGCAGTAGCAGTGAGC"
    x_rc = screed.rc(x)
    hi.consume(x_rc[:10])

    print(len(x))

    (y, pos) = hi.trim_below_abundance(x, 0)
    assert pos == len(x) - hi.ksize() + 1
    assert x[:pos] == y
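
Every example on this page uses screed.rc, which returns the reverse complement of a DNA string. A minimal illustration:

import screed

# complement each base, then reverse the result: rc("ATGC") -> "GCAT"
assert screed.rc("ATGC") == "GCAT"
assert screed.rc(screed.rc("ATGC")) == "ATGC"   # rc is its own inverse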
Example #3
import argparse
import sys

import screed

import parse_blastz2   # project-local helper for parsing BLASTZ output


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('contigfile')
    parser.add_argument('blastz_alignment')
    parser.add_argument('-l', '--min-length', type=int, default=100)
    parser.add_argument('-b', '--boundary', type=int, default=5)
    args = parser.parse_args()

    sequences = {}
    for record in screed.open(args.contigfile):
        sequences[record.name] = record.sequence
        rc_name = record.name + " (reverse complement)"
        sequences[rc_name] = screed.rc(str(record.sequence))
    print('loaded %d sequences from %s' % (len(sequences), args.contigfile), file=sys.stderr)

    fp = open(args.blastz_alignment)
    records = parse_blastz2.parse_blastz(fp, args.min_length)
    print('loaded %d records with min length %d' % (len(records), args.min_length), file=sys.stderr)

    # make things unique by subject match
    uniq_records = [ (s_name, s_start, s_end) for (q, s_name, s_start, s_end) in records ]
    uniq_records = set(uniq_records)

    print('uniqified down to %d records' % (len(uniq_records),), file=sys.stderr)

    for (s_name, s_start, s_end) in uniq_records:
        seq = sequences[s_name]
        
        b_start = max(s_start - 1 - args.boundary, 0)
        b_end = min(s_end - 1 + args.boundary, len(seq))
        
        interval = seq[b_start:b_end]
        s_short = s_name.split()[0]
        if 'reverse complement' in s_name:
            s_short += 'RC'
        print('>%s:%d-%d\n%s' % (s_short, b_start, b_end, interval))
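
Example #3 loads every contig twice, once per strand, so a match reported against either orientation can be looked up by name. That pattern, extracted as a sketch (the function name is illustrative):

import screed

def load_both_strands(filename):
    # map each record name to its sequence, plus a second entry for the
    # reverse complement under a tagged name
    sequences = {}
    for record in screed.open(filename):
        sequences[record.name] = record.sequence
        sequences[record.name + " (reverse complement)"] = screed.rc(str(record.sequence))
    return sequences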
Example #4

import argparse
import collections
import logging
import random

import screed
import sourmash
from Bio import bgzf   # assumed: Biopython's blocked-gzip writer

# read_bcalm and contract_degree_two are helpers defined elsewhere in the
# module this example was taken from.


def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument('bcalm_unitigs')
    parser.add_argument('gxt_out')
    parser.add_argument('contigs_out')
    parser.add_argument('-k', '--ksize', type=int, default=31)
    parser.add_argument('-d', '--debug', action='store_true')
    parser.add_argument('-P', '--pendants', action="store_true",
                        help="don't remove low abundance pendants")
    parser.add_argument('-a', '--abundance', nargs='?', type=float,
                        default=1.1)
    parser.add_argument('--randomize', action='store_true',
                        help='randomize cDBG order')
    args = parser.parse_args(argv)

    k = args.ksize

    trim = not args.pendants
    trim_cutoff = args.abundance
    unitigs = args.bcalm_unitigs
    debug = args.debug

    if args.debug:
        logging.basicConfig(filename='bcalm_to_gxt.log', filemode='w',
                            level=logging.DEBUG)
    else:
        logging.basicConfig(filename='bcalm_to_gxt.log', filemode='w',
                            level=logging.WARNING)

    logging.debug("starting bcalm_to_gxt run.")

    gxtfp = open(args.gxt_out, 'wt')
    contigsfp = bgzf.open(args.contigs_out, 'wb')
    info_filename = args.contigs_out + '.info.csv'
    info_fp = open(info_filename, 'wt')
    in_mh = sourmash.MinHash(0, 31, scaled=1000)
    out_mh = sourmash.MinHash(0, 31, scaled=1000)

    # load in the basic graph structure from the BCALM output file
    neighbors, sequences, mean_abunds, sizes = read_bcalm(unitigs, debug, k)

    # record input k-mers in a minhash
    for seq in sequences.values():
        in_mh.add_sequence(seq)

    # make the order deterministic: put each sequence into a canonical
    # orientation (lexicographically smaller strand), then sort.
    print('reordering...')

    # first, put sequences in specific orientation
    sequence_list = []
    for key in neighbors:
        v = sequences[key]

        # pick lexicographically smaller of forward & reverse complement.
        v2 = screed.rc(v)
        if v > v2:
            v = v2
        sequence_list.append((v, key))
        del sequences[key]

    # sort all sequences:
    sequence_list.sort(reverse=True)
    if args.randomize:
        print('(!! randomizing order per --randomize !!)')
        random.shuffle(sequence_list)

    # ok, now remap all the things.
    remapping = {}
    new_sequences = {}

    # remap sequences
    new_key = 0
    while sequence_list:                  # consume while iterating
        sequence, old_key = sequence_list.pop()
        remapping[old_key] = new_key
        new_sequences[new_key] = sequence
        new_key += 1

    # remap other things
    new_neighbors = collections.defaultdict(set)
    for old_key, vv in neighbors.items():
        new_vv = [ remapping[v] for v in vv ]
        new_neighbors[remapping[old_key]] = set(new_vv)

    new_mean_abunds = {}
    for old_key, value in mean_abunds.items():
        new_mean_abunds[remapping[old_key]] = value

    new_sizes = {}
    for old_key, value in sizes.items():
        new_sizes[remapping[old_key]] = value

    assert len(sequences) == 0
    print('...done')

    sequences = new_sequences
    mean_abunds = new_mean_abunds
    sizes = new_sizes
    neighbors = new_neighbors

    # if we are removing pendants, we need to relabel the contigs so they are
    # consecutive integers starting from 0.  If not, we create dummy data
    # structures to make the interface the same elsewhere in the data
    if trim:
        print('removing pendants...')
        non_pendants = set(v for v, N in neighbors.items() if len(N) > 1 or
                           mean_abunds[v] > trim_cutoff)
        contract_degree_two(non_pendants, neighbors, sequences, mean_abunds,
                            sizes, k)
    else:
        non_pendants = list(neighbors.keys())
    aliases = {x: i for i, x in enumerate(sorted(non_pendants))}
    n = len(aliases)

    # write out sequences & compute offsets
    offsets = {}
    kv_list = sorted(aliases.items(), key=lambda x:x[1])
    for x, i in kv_list:
        offsets[x] = contigsfp.tell()
        contigsfp.write('>{}\n{}\n'.format(i, sequences[x]))
        out_mh.add_sequence(sequences[x])
    contigsfp.close()

    print('... done! {} unitigs'.format(n))

    # start the gxt file by writing the number of nodes (unitigs)
    gxtfp.write('{}\n'.format(n))

    # write out all of the links, in 'from to' format.
    n_edges = 0
    for v, N in sorted(neighbors.items()):
        for u in sorted(N):
            gxtfp.write('{} {}\n'.format(aliases[v], aliases[u]))
            n_edges += 1

    print('{} vertices, {} edges'.format(n, n_edges))

    info_fp.write('contig_id,offset,mean_abund,n_kmers\n')
    for v, i in aliases.items():
        info_fp.write('{},{},{:.3f},{}\n'.format(i, offsets[v],
                                                 mean_abunds[v],
                                                 sizes[v]))

    # output two sourmash signatures: one for input contigs, one for
    # output contigs.
    in_sig = sourmash.SourmashSignature(in_mh, filename=args.bcalm_unitigs)
    sourmash.save_signatures([ in_sig ],
                             open(args.bcalm_unitigs + '.sig', 'wt'))

    out_sig = sourmash.SourmashSignature(out_mh, filename=args.contigs_out)
    sourmash.save_signatures([ out_sig ],
                             open(args.contigs_out + '.sig', 'wt'))
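
The reordering step above relies on a canonical-orientation trick: a unitig and its reverse complement should map to the same representative, so the lexicographically smaller of the two strings is kept. As a standalone sketch (the function name is illustrative):

import screed

def canonical(seq):
    # return the lexicographically smaller of a sequence and its reverse
    # complement, so both strands share one representative string
    rc = screed.rc(seq)
    return seq if seq <= rc else rc

assert canonical("TTGA") == canonical(screed.rc("TTGA"))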
Example #5
import random
import sys

import screed

# COVERAGE, READLEN, and ERROR_RATE are constants defined earlier in the
# original script this snippet was taken from.
record = next(iter(screed.open(sys.argv[1])))
genome = record.sequence
len_genome = len(genome)

n_reads = int(len_genome*COVERAGE / float(READLEN))
reads_mut = 0
total_mut = 0

for i in range(n_reads):
    start = random.randint(0, len_genome - READLEN)
    read = genome[start:start + READLEN].upper()

    # reverse complement?
    if random.choice([0, 1]) == 0:
        read = screed.rc(read)

    # error?
    was_mut = False
    for _ in range(READLEN):
        while random.randint(1, ERROR_RATE) == 1:
            pos = random.randint(1, READLEN) - 1
            read = read[:pos] + random.choice(['a', 'c', 'g', 't']) + read[pos + 1:]
            was_mut = True
            total_mut += 1

    if was_mut:
        reads_mut += 1
    
    print('>read%d\n%s' % (i, read))
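
The error loop gives each of READLEN trials a 1-in-ERROR_RATE chance of substituting a random position (mutations are lower-cased so they stand out against the upper-cased read). A simpler per-position variant with roughly the same expected error rate, sketched under illustrative names:

import random

def mutate(read, error_rate=100):
    # substitute each position independently with probability 1/error_rate
    bases = list(read)
    n_mut = 0
    for pos in range(len(bases)):
        if random.randint(1, error_rate) == 1:
            bases[pos] = random.choice(['a', 'c', 'g', 't'])
            n_mut += 1
    return ''.join(bases), n_mut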
Example #7
import screed


def _equals_rc(query, match):
    # treat a sequence and its reverse complement as equal
    return (query == match) or (screed.rc(query) == match)
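
A quick check of the strand-insensitive comparison (screed.rc("ATG") is "CAT"):

assert _equals_rc("ATG", "ATG")
assert _equals_rc("ATG", "CAT")   # matches the reverse complement too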
Example #8

import random

import screed

# N_READS, READLEN, ERROR_RATE, seqs, and indices are defined earlier in the
# original script this snippet was taken from.
n_reads = N_READS
reads_mut = 0
total_mut = 0

z = []  # unused in this excerpt
for i in range(n_reads):
    index = random.choice(indices)
    sequence = seqs[index]

    start = random.randint(0, len(sequence) - READLEN)
    read = sequence[start:start + READLEN].upper()

    # reverse complement?
    if random.choice([0, 1]) == 0:
        read = screed.rc(read)

    # error?
    was_mut = False
    for _ in range(READLEN):
        while random.randint(1, ERROR_RATE) == 1:
            pos = random.randint(1, READLEN) - 1
            read = read[:pos] + random.choice(['a', 'c', 'g', 't']) + read[pos + 1:]
            was_mut = True
            total_mut += 1

    if was_mut:
        reads_mut += 1

    print('>read%d\n%s' % (i, read))
Example #9
import random
import sys

import screed


def main():
    random.seed(1)                  # make this reproducible.

    infname = "NCDScomplete_p_10gxG.fasta"   # or: sys.argv[1]
    outname = "NCDScomplete_p_10gxG_10X"     # or: sys.argv[2]

    outfile = '%s.fasta' % outname
    outf = open(outfile, "w+")
    logfile = '%s.log' % outname
    outlog = open(logfile, "w+")

    COVERAGE = 10
    READLEN = 100
    ERROR_RATE = 100

    records = screed.open(infname)
    seq_id = 0
    total_reads = 0
    total_mutated = 0
    total_mutations = 0

    for g in records:
        name = "n%s" % g.name  # the 'n' prefix marks noise reads
        genome = g.sequence
        len_genome = len(genome)
    
        n_reads = int(len_genome*COVERAGE / float(READLEN))
        reads_mut = 0
        total_mut = 0
    
        for i in range(n_reads):
            if len_genome < READLEN:
                continue  # too short to sample a full-length read
            start = random.randint(0, len_genome - READLEN)
            read = genome[start:start + READLEN].upper()

            # reverse complement?
            if random.choice([0, 1]) == 0:
                read = screed.rc(read)

            # error?
            was_mut = False
            for _ in range(READLEN):
                while random.randint(1, ERROR_RATE) == 1:
                    pos = random.randint(1, READLEN) - 1
                    read = read[:pos] + random.choice(['a', 'c', 'g', 't']) + read[pos+1:]
                    was_mut = True
                    total_mut += 1

            if was_mut:
                reads_mut += 1
            
            outf.write('>n%dread%d\n%s\n' % (seq_id, i, read))

        outlog.write("%d of %d reads mutated; %d total mutations from sequence %d => %s\n" %
                     (reads_mut, n_reads, total_mut, seq_id, name))
        total_reads += n_reads
        total_mutated += reads_mut
        total_mutations += total_mut
        seq_id += 1

    outlog.write("TOTAL: %d of %d reads mutated; %d total mutations" % \
                 (total_mutated, total_reads, total_mutations))
    print >>sys.stderr, "Done!!"