Example #1
def main():
    #pdb.set_trace()
    args = get_args()
    names = {}
    temp1 = "{}.temp1".format(args.infile)
    temp2 = "{}.temp2".format(args.infile)
    outf = fasta.FastaWriter(temp1)
    mask_file = os.path.splitext(args.infile)[0] + ".fa.out"
    f = fasta.FastaReader(args.infile)
    for seq in f:
        print seq.identifier
        gb = seq.identifier.split('|')[3]
        newname = seq.identifier.split(',')[0].split(' ')[-1]
        names[gb] = newname
        seq.identifier = ">{}".format(gb)
        outf.write(seq)
    outf.close()
    cmd = ["maskOutFa", "-softAdd", temp1, mask_file, temp2]
    subprocess.Popen(cmd).wait()
    final = "{}.masked".format(args.infile)
    outf = fasta.FastaWriter(final)
    for seq in fasta.FastaReader(temp2):
        iden = seq.identifier.strip('>')
        seq.identifier = "{}".format(names[iden])
        print seq.identifier
        outf.write(seq)
    outf.close()
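A quick check of the two split expressions above, on a hypothetical NCBI-style identifier (the ">gi|...|gb|ACCESSION| description" layout is assumed, not taken from the example's data):

ident = ">gi|158508573|gb|EU054329.1| Anolis carolinensis voucher ABC123, mRNA"
gb = ident.split('|')[3]                      # 'EU054329.1'
newname = ident.split(',')[0].split(' ')[-1]  # last word before the first comma -> 'ABC123'
print gb, newname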
Example #2
def setUp(self):
    # switch to this directory - so we can have access to data
    try:
        os.chdir(os.path.dirname(os.path.abspath(__file__)))
    except OSError:
        pass
    self.fasta = fasta.FastaReader('test-data/sequence.fasta')
Example #3
def main():
    args = get_args()
    conn = sqlite3.connect(args.db)
    c = conn.cursor()
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    config.read(args.config)
    organisms = get_names_from_config(config, args.group)
    excludes = get_names_from_config(config, 'Excludes')
    if excludes:
        organisms = [org for org in organisms if org not in excludes]
    args.output.write("org\tcontigs\tavg len\n")
    for org in organisms:
        # skip extended data, which are typically from genome-enabled orgs,
        # not capture data
        if not org.endswith('*'):
            # get the uce-matching node names from the db
            matching_nodes = get_matching_node_names(c, org)
            # parse the contig file for the organism, and return contig
            # lengths
            f = os.path.join(args.fasta, "{0}.{1}".format(org.replace('_', '-'), 'contigs.fasta'))
            records = fasta.FastaReader(f)
            contig_lens = [len(seq) for seq in records
                           if '_'.join(seq.identifier.strip('>').split('_')[0:2]) in matching_nodes]
            # write the average contig length of contigs matching UCEs
            args.output.write("{0}\t{1}\t{2}\n".format(org, len(contig_lens), float(sum(contig_lens))/len(contig_lens)))
Example #4
def get_fasta_dict(args):
    print 'Building the locus dictionary...'
    if args.ambiguous:
        print 'NOT removing sequences with ambiguous bases...'
    else:
        print 'Removing ALL sequences with ambiguous bases...'
    loci = defaultdict(list)
    for record in fasta.FastaReader(args.infile):
        #pdb.set_trace()
        if not args.faircloth:
            locus = record.identifier.split('|')[1]
        else:
            locus = '_'.join([record.identifier.split('|')[0], \
                record.identifier.split('|')[1].split('_')[0]])
        loci = build_locus_dict(loci, locus, record, args.ambiguous)
    # work on a copy so we can iterate and delete
    snapshot = copy.deepcopy(loci)
    # iterate over loci to check for all species at a locus
    for locus, data in snapshot.iteritems():
        if args.notstrict:
            if len(data) < 3:
                t = "\tDropping Locus {0} because it has fewer " + \
                        "than the minimum number " + \
                        "of taxa for alignment (N < 2)"
                print t.format(locus)
                del loci[locus]
        else:
            if len(data) < args.species:
                del loci[locus]
                t = "\tDropping Locus {0} because it has fewer " + \
                        "than the minimum number " + \
                        "of taxa for alignment (N < 2)"
                print t.format(locus)
    return loci
Example #5
def main():
    args = get_args()
    avg_read_length = get_average_read_length(args.input)
    kmer = int(raw_input("What was the k-mer length? "))
    avg_c = []
    for read in fasta.FastaReader(args.input):
        s_read = read.identifier.split('_')
        ck = float(s_read[-1])
        c = ck * avg_read_length / (avg_read_length - kmer + 1)
        avg_c.append(c)
    avg_c = numpy.array(avg_c)
    if not args.csv:
        print "mean:\t", numpy.mean(avg_c)
        print "95ci:\t", 1.96 * (numpy.std(avg_c, ddof=1) /
                                 math.sqrt(len(avg_c)))
        print "min:\t", min(avg_c)
        print "max:\t", max(avg_c)
        print "median:\t", numpy.median(avg_c)
        print "<10x:\t", sum(avg_c < 10)
        print "<25x:\t", sum(avg_c < 25)
        print "<50x:\t", sum(avg_c < 50)
        print "<100x:\t", sum(avg_c < 100)
    else:
        print "{0},{1},{2},{3},{4},{5},{6},{7},{8}".format(
            numpy.mean(avg_c),
            1.96 * (numpy.std(avg_c, ddof=1) / math.sqrt(len(avg_c))),
            min(avg_c), max(avg_c), numpy.median(avg_c), sum(avg_c < 10),
            sum(avg_c < 25), sum(avg_c < 50), sum(avg_c < 100))
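The per-read computation above is the usual k-mer coverage correction from Velvet-style assemblers: a read of length L contains L - k + 1 k-mers, so nucleotide coverage C relates to k-mer coverage Ck as C = Ck * L / (L - k + 1). A worked example with hypothetical numbers:

avg_read_length, kmer, ck = 36.0, 21, 20.0  # hypothetical read length, k, and k-mer coverage
c = ck * avg_read_length / (avg_read_length - kmer + 1)  # 36 - 21 + 1 = 16 k-mers per read
print c  # 45.0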
Example #6
def main():
    args = get_args()
    uce_loci = []
    # get lengths of loci
    seq_lengths = {}
    for seq in fasta.FastaReader(args.fasta):
        name = seq.identifier.split('|')[1]
        uce_loci.append(name)
        seq_lengths[name] = len(seq.sequence)
    overlappers = defaultdict(dict)
    names = defaultdict(list)
    coords = {}
    for match in lastz.Reader(args.lastz, long_format=True):
        locus = match.name2.split('|')[1]
        chromo = match.name1
        coords[locus] = (match.zstart1, match.end1)
        for pmatch, span in overlappers[chromo].iteritems():
            #if locus == 'chr5_10696_s' and pmatch == 'chr13_710_s':
            #    pdb.set_trace()
            overlap = span.find(match.zstart1, match.end1)
            if overlap:
                overlappers[chromo][pmatch].insert(match.zstart1, match.end1,
                                                   locus)
                names[pmatch].append(locus)
                break
        else:
            overlappers = add_new_locus(match, overlappers, chromo)
    overlapping_loci = []
    all_groups = []
    for k, v in names.iteritems():
        # group loci into overlapping clusters
        base = [k]
        base.extend(v)
        all_groups.append(base)
        # get list of "bad loci" so we can determine non-overlappers
        overlapping_loci.append(k)
        overlapping_loci.extend(v)
    #pdb.set_trace()
    non_overlapping_loci = set(uce_loci).difference(set(overlapping_loci))
    # generate output in config-file format:
    config = ConfigParser.RawConfigParser()
    config.add_section('Non-overlapping loci')
    for locus in list(non_overlapping_loci):
        config.set('Non-overlapping loci', locus, seq_lengths[locus])
    longest_of_overlapping = get_longest_of_overlapping_loci(
        all_groups, seq_lengths)
    config.add_section('Longest loci of group')
    for locus in longest_of_overlapping:
        config.set('Longest loci of group', locus, seq_lengths[locus])
    config.add_section('Superlocus groups')
    for c, group in enumerate(all_groups):
        # order loci by start position
        starts = [(name, coords[name][0], coords[name][1]) for name in group]
        starts = sorted(starts, key=itemgetter(1))
        sorted_names = [n[0] for n in starts]
        #print starts
        #pdb.set_trace()
        config.set('Superlocus groups', "Group{0}".format(c),
                   ','.join(sorted_names))
    config.write(args.output)
Example #7
def get_fasta_dict(args):
    if args.verbose:
        sys.stdout.write('Building the locus dictionary...\n')
        if args.ambiguous:
            sys.stdout.write(
                'NOT removing sequences with ambiguous bases...\n')
        else:
            sys.stdout.write(
                'Removing ALL sequences with ambiguous bases...\n')
    sys.stdout.flush()
    loci = defaultdict(list)
    if os.path.isfile(args.infile):
        for record in fasta.FastaReader(args.infile):
            if not args.faircloth:
                locus = record.identifier.split('|')[1]
            else:
                locus = '_'.join([record.identifier.split('|')[0], \
                    record.identifier.split('|')[1].split('_')[0]])
            loci = build_locus_dict(loci, locus, record, args.ambiguous)
    # work with a directory of fastas if we have those - get locus name from
    # filename
    elif os.path.isdir(args.infile):
        for ff in glob.glob(os.path.join(args.infile, '*.fa*')):
            locus = os.path.splitext(os.path.basename(ff))[0]
            for record in fasta.FastaReader(ff):
                loci = build_locus_dict(loci, locus, record, args.ambiguous)
    # work on a copy so we can iterate and delete
    snapshot = copy.deepcopy(loci)
    # iterate over loci to check for all species at a locus
    for locus, data in snapshot.iteritems():
        if args.notstrict:
            if len(data) < 3:
                t = "\tDropping Locus {0} because it has fewer " + \
                        "than the minimum number " + \
                        "of taxa for alignment (N < 2)\n"
                sys.stdout.write(t.format(locus))
                sys.stdout.flush()
                del loci[locus]
        else:
            if len(data) < args.species:
                del loci[locus]
                t = "\tDropping Locus {0} because it has fewer " + \
                        "than the minimum number " + \
                        "of taxa for alignment (N < 2)\n"
                sys.stdout.write(t.format(locus))
                sys.stdout.flush()
    return loci
Example #8
def main():
    args = get_args()
    records = fasta.FastaReader(args.fasta)
    lengths = defaultdict(list)
    for sequence in records:
        # BEWARE:  this may cause name clash, which will error out
        org = sequence.identifier.split(' ')[0].split('_')[-2]
        lengths[org].append(len(sequence))
    for org, l in lengths.iteritems():
        #pdb.set_trace()
        args.output.write("{0}\t{1}\n".format(org, float(sum(l)) / len(l)))
Example #9
def main():
    args = get_args()
    uces = set([
        get_name(read.identifier, "|", 1)
        for read in fasta.FastaReader(args.query)
    ])
    files = glob.glob(os.path.join(args.lastz, '*.lastz'))
    # this prob. needs to be more robust
    organisms = [
        os.path.splitext(os.path.basename(f).split('-')[-1])[0].replace(
            '-', "_") for f in files
    ]
    conn, c = create_match_database(args.db, organisms, uces)
    if args.dupefile:
        dupes = get_dupes(args.dupefile)
    else:
        dupes = None
    #pdb.set_trace()
    for f in files:
        critter = os.path.splitext(os.path.basename(f).split('-')[-1])[0]
        matches, probes = get_matches(f, args.splitchar, args.components)
        count = 0
        for k, v in matches.iteritems():
            skip = False
            if len(v) > 1:
                if run_checks(k, v, probes, args.verbose):
                    # sort by match position
                    v_sort = sorted(v, key=itemgetter(2))
                    start, end = v_sort[0][2], v_sort[-1][3]
                    diff = end - start
                    # ensure our range is less than N(probes) * probe_length - this
                    # still gives us a little wiggle room because probes are ~ 2X tiled
                    if diff > (probes[k] * 120):
                        skip = True
                        if args.verbose:
                            print "range longer than expected"
                else:
                    skip = True
            elif args.dupefile and k in dupes:
                skip = True
                if args.verbose: print "{0} is in dupefile".format(k)
            else:
                pass
            if not skip:
                store_lastz_results_in_db(c, critter, k)
                count += 1
        print "Entered {} matches for {}".format(count, critter)
    conn.commit()
    c.close()
    conn.close()
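The span check above assumes 120 bp probes tiled at roughly 2X, so all matches for one locus should cover less than n_probes * 120 bp. A toy check using hypothetical match tuples in the same (probe, strand, start, end) layout the code sorts on:

from operator import itemgetter

v = [('uce-1_p3', '+', 240, 360), ('uce-1_p1', '+', 0, 120), ('uce-1_p2', '+', 60, 180)]
v_sort = sorted(v, key=itemgetter(2))
start, end = v_sort[0][2], v_sort[-1][3]
print (end - start) > (3 * 120)  # False - the span is within the expected range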
Example #10
def add_additional_columns(args, conn, c):
    assert args.name is not None, "You need to include --add-name to add a table"
    query = """ALTER TABLE probeset ADD COLUMN {0} int DEFAULT 0""".format(
        args.name)
    c.execute(query)
    for seq in fasta.FastaReader(args.add):
        locus = seq.identifier.lstrip('>').split('|')[0]
        query = """SELECT id, locus, probe, source, sequence, oldprobe
                FROM probes
                WHERE oldprobe LIKE '%{0}%'""".format(locus)
        c.execute(query)
        rows = c.fetchall()
        hit = False
        for row in rows:
            idx, locus, probe, source, sequence, oldlocus = row
            if seq.sequence in sequence:
                hit = True
                query = """UPDATE probeset set {0} = 1 WHERE id = {1}""".format(
                    args.name, idx)
                c.execute(query)
        if not hit:
            print "Miss: {0}".format(seq.identifier)
Example #11
def main():
    args = get_args()
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    config.read(args.config)
    conn = sqlite3.connect(args.db)
    c = conn.cursor()
    if args.extend_db:
        query = "ATTACH DATABASE '{0}' AS extended".format(args.extend_db)
        c.execute(query)
    organisms = get_names_from_config(config, 'Organisms')
    uces = get_names_from_config(config, 'Loci')
    #pdb.set_trace()
    uce_fasta_out = fasta.FastaWriter(args.output)
    regex = re.compile("[Nn]{1,21}")  # match runs of ambiguous bases
    for organism in organisms:
        print "Getting {0} reads...".format(organism)
        written = []
        # going to need to do something more generic w/ suffixes
        #pdb.set_trace()
        name = organism.replace('_', '-')
        if args.notstrict:
            if not organism.endswith('*'):
                reads = find_file(args.contigs, name)
                node_dict, missing = get_nodes_for_uces(c, organism, uces, extend=False, notstrict=True)
            elif args.extend_dir:
                # remove the asterisk
                name = name.rstrip('*')
                reads = find_file(args.extend_dir, name)
                node_dict, missing = get_nodes_for_uces(c, organism.rstrip('*'), uces, extend=True, notstrict=True)
        else:
            if not name.endswith('*'):
                reads = find_file(args.contigs, name)
                node_dict, missing = get_nodes_for_uces(c, organism, uces)
            elif name.endswith('*') and args.extend_dir:
                # remove the asterisk
                name = name.rstrip('*')
                reads = find_file(args.extend_dir, name)
                node_dict, missing = get_nodes_for_uces(c, organism.rstrip('*'), uces, extend=True)
        for read in fasta.FastaReader(reads):
            name = get_name(read.identifier).lower()
            coverage = get_coverage(read.identifier)
            if name in node_dict.keys():
                uce_seq = fasta.FastaSequence()
                uce_seq.identifier = ">{0}_{1} |{0}|{2}".format(node_dict[name][0], organism, coverage)
                # deal with strandedness because aligners dont, which
                # is annoying
                if node_dict[name][1] == '-':
                    uce_seq.sequence = transform.DNA_reverse_complement(read.sequence)
                else:
                    uce_seq.sequence = read.sequence
                # strip runs of ambiguous bases
                if regex.search(uce_seq.sequence):
                    uce_seq.sequence = re.sub(regex, "", uce_seq.sequence)
                    print "\tRemoved ambiguous bases in {0}".format(uce_seq.identifier.split(' ')[0])
                uce_fasta_out.write(uce_seq)
                written.append(str(node_dict[name][0]))
            else:
                pass
        #pdb.set_trace()
        if args.notstrict and missing:
            args.notstrict.write("[{0}]\n".format(organism))
            for name in missing:
                args.notstrict.write("{0}\n".format(name))
                written.append(name)
        assert set(written) == set(uces), "UCE names do not match"
        #assert set(written) == set(uces), pdb.set_trace()
    uce_fasta_out.close()
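The strand handling above calls transform.DNA_reverse_complement from the package's transform module. A plain-Python stand-in, just to show what that call is assumed to do:

from string import maketrans

def revcomp(seq, _table=maketrans('ACGTacgt', 'TGCAtgca')):
    # complement each base, then reverse the sequence
    return seq.translate(_table)[::-1]

print revcomp('ATGCCC')  # 'GGGCAT'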
Example #12
def main():
    uces = []
    # get all ids of probes in 2560 set
    for seq in fasta.FastaReader('../archive/probe-subset-2560-synthesized.fasta'):
        name_split = seq.identifier.split('_')
        if name_split[0] not in ['>chrE22C19W28','>chrUn']:
            iden = '_'.join(name_split[:2]).strip('>')
        else:
            iden = '_'.join(name_split[:3]).strip('>')
        uces.append(iden)
    # get names, lengths, and GC content of loci in dbase
    conn = sqlite3.connect('/Users/bcf/Git/brant/seqcap/Non-repo/probe.sqlite')
    cur = conn.cursor()
    metadata = defaultdict(dict)
    for uce in uces:
        cur.execute("SELECT cons, cons_len FROM cons WHERE seq = ?", (uce,))
        data = cur.fetchall()
        # ensure we only get one record back
        assert len(data) == 1, "More than one record"
        read, length = data[0]
        gc = round((read.count('C') + read.count('G')) / float(len(read)), 3)
        cur.execute('''SELECT count(*) FROM sureselect WHERE seq = ? AND
                selected = 1''', (uce,))
        data = cur.fetchall()
        assert len(data) == 1, "More than one record"
        count = data[0][0]
        if count > 1:
            cur.execute('''SELECT avg(tm), avg(masked_bases),
            avg(added_bases) from sureselect where seq = ? 
            group by seq''', (uce,))
        else:
            cur.execute('''SELECT tm, masked_bases,
            added_bases from sureselect where seq = ? 
            group by seq''', (uce,))
        tm, masked, added = cur.fetchall()[0]
        metadata[uce] = {
            'gc':gc,
            'length':length,
            'count':count,
            'tm':tm,
            'masked':masked,
            'added':added
            }
    cur.close()
    conn.close()
    conn = sqlite3.connect('../archive/birds-probe-matches.sqlite')
    cur = conn.cursor()
    taxa = [
            'anser_erythropus',
            'gallus_gallus',
            'pitta_guajana',
            'dromaius_novaehollandiae',
            'megalaima_virens',
            'struthio_camelus',
            'eudromia_elegans',
            'phalacrocorax_carbo',
            'urocolius_indicus',
        ]
    query = "SELECT {} FROM matches WHERE uce = ?".format(', '.join(taxa))
    for uce in metadata.keys():
        cur.execute(query, (uce.lower(),))
        data = cur.fetchall()
        for k,v in enumerate(data[0]):
            metadata[uce][taxa[k]] = v
        #pdb.set_trace()
    outfile = open('gc-length-species-matches.csv', 'w')
    outfile.write('uce,gc,length,count,tm,masked,added,present,taxon\n')
    for uce in sorted(metadata.keys()):
        for taxon in taxa:
            outfile.write('{},{},{},{},{},{},{},{},{}\n'.format(
                uce,
                metadata[uce]['gc'],
                metadata[uce]['length'],
                metadata[uce]['count'],
                metadata[uce]['tm'],
                metadata[uce]['masked'],
                metadata[uce]['added'],
                metadata[uce][taxon],
                taxon.replace('_',' ').capitalize()
            ))
    outfile.close()
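A quick check of the GC-content computation used above, on a toy sequence:

read = "ACGTGCAT"  # hypothetical sequence
gc = round((read.count('C') + read.count('G')) / float(len(read)), 3)
print gc  # 0.5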
Example #13
def main():
    args = get_args()
    if args.regex and args.repl is not None:
        # "s_[0-9]+$"
        regex = re.compile(args.regex)
        uces = set([get_name(read.identifier, "|", 1, regex=regex, repl=args.repl)
            for read in fasta.FastaReader(args.query)])
    else:
        uces = set([get_name(read.identifier, "|", 1)
            for read in fasta.FastaReader(args.query)])
        regex = None
    if args.dupefile:
        print "\t Getting dupes"
        dupes = get_dupes(args.dupefile, regex, args.repl)
    contig = args.contigs  # glob.glob(os.path.join(args.contigs, '*.fa*'))
    organisms = ["contigs"]  # get_organism_names_from_fasta_files(fasta_files)
    conn, c = create_probe_database(uces)
    print "Processing:"
    #for contig in fasta_files:
    critter = os.path.basename(contig).split('.')[0].replace('-', "_")
    #output = args.align
    # os.path.join(
    #         args.align, \
    #         os.path.splitext(os.path.basename(contig))[0] + '.lastz'
    #      )
    contigs = contig_count(contig)
    # align the probes to the contigs
    alignment = lastz.Align(
        contig,
        args.query,
        args.coverage,
        args.identity,
        args.align
    )
    lzstdout, lztstderr = alignment.run()
    # parse the lastz results of the alignment
    matches, orientation, revmatches = \
                defaultdict(set), defaultdict(set), defaultdict(set)
    probe_dupes = set()
    if not lztstderr:
        for lz in lastz.Reader(args.align):
            # get strandedness of match
            contig_name = get_name(lz.name1)
            uce_name = get_name(lz.name2, "|", 1, regex=regex, repl=args.repl)
            if args.dupefile and uce_name in dupes:
                probe_dupes.add(uce_name)
            else:
                matches[contig_name].add(uce_name)
                orientation[uce_name].add(lz.strand2)
                revmatches[uce_name].add(contig_name)
    else:
        print "Error in lastz:"
        print "STDerr:"
        print lztstderr
        print "STDout:"
        print lzstdout

    # we need to check nodes for dupe matches to the same probes
    contigs_matching_mult_uces = check_contigs_for_dupes(matches)
    uces_matching_mult_contigs = check_probes_for_dupes(revmatches)
    nodes_to_drop = contigs_matching_mult_uces
    nodes_to_drop_one_of = uces_matching_mult_contigs
    # remove dupe and/or dubious nodes/contigs
    match_copy = copy.deepcopy(matches)
    already_observed = list()
    for k in match_copy.keys():
        if k in nodes_to_drop:
            del matches[k]
        elif k in nodes_to_drop_one_of:
            if matches[k] in already_observed:
                del matches[k]
            else:
                already_observed.append(matches[k])
    store_lastz_results_in_db(c, matches, orientation, critter)
    conn.commit()
    pretty_print_output(
                critter,
                matches,
                contigs,
                probe_dupes,
                contigs_matching_mult_uces,
                uces_matching_mult_contigs
            )
    # get all the UCE records from the db
    query = "SELECT uce, {0} FROM match_map WHERE {0} IS NOT NULL".format("contigs")
    c.execute(query)
    data = {row[1].split("(")[0]:row[0] for row in c.fetchall()}
    nodenames = set(data.keys())
    # make sure we don't lose any dupes
    assert len(data) == len(nodenames), "There were duplicate contigs."
    outp = open(args.output, 'w')
    print "Building UCE fasta:"
    #for contig in fasta_files:
    for record in SeqIO.parse(open(contig), 'fasta'):
        name = '_'.join(record.id.split('_')[:2])
        if name.lower() in nodenames:
            record.id = "{0}|{1}".format(data[name.lower()], record.id)
            outp.write(record.format('fasta'))
    outp.close()
Example #14
def main():
    args = get_args()
    # compile some regular expressions we'll use later
    stripnum = re.compile("s_[0-9]+$")
    manyn = re.compile("[Nn]{20,}")  # runs of 20+ ambiguous bases
    # get names of loci and taxa
    uces = get_uce_names_from_probes(args.probes)
    taxa = get_taxa_names_from_fastas(args.fasta)
    print "\n"
    if not args.extend:
        if args.db is None:
            db = os.path.join(args.output, 'probe.matches.sqlite')
        else:
            db = args.db
        # create db to hold results
        conn, c = create_probe_database(
                db,
                taxa,
                uces,
                True
            )
    else:
        conn, c = extend_probe_database(
                args.db,
                taxa
            )
    # get duplicate probe sequences for filtering
    if args.dupefile:
        print "Determining duplicate probes..."
        dupes = get_dupes(args.dupefile, longfile=False)
    else:
        dupes = None
    # iterate over LASTZ files for each taxon
    for lz in glob.glob(os.path.join(args.lastz, '*')):
        # get fasta name from lastz file
        ff = get_fasta_name_from_lastz_pth(lz, args.fasta, args.pattern)
        # get taxon name from lastz file
        taxon = get_taxon_from_filename(ff)
        print "\n{0}\n{1}\n{0}".format('=' * 30, taxon)
        # get lastz matches
        print "\tGetting LASTZ matches from GENOME alignments..."
        matches, probes = get_matches(lz)
        # remove bad loci (dupes)
        print "\tGetting bad (potentially duplicate) GENOME matches..."
        loci_to_skip = []
        for k, v in matches.iteritems():
            # check matches to make sure all is well - keep names lc
            loci_to_skip.extend(quality_control_matches(matches, probes, dupes, k, v, False))
        #pdb.set_trace()
        # convert to set, to keep only uniques
        loci_to_skip = set(loci_to_skip)
        print "\tSkipping {} bad (duplicate hit) loci...".format(len(loci_to_skip))
        # get (and possibly assemble) non-skipped
        seqdict = defaultdict(list)
        # determine those contigs to skip and group those to assemble
        for contig in fasta.FastaReader(ff):
            # make sure all names are lowercase
            contig.identifier = contig.identifier.lower()
            name = contig.identifier.split('|')[-4].strip()
            locus = name.split('_')[0]
            # skip what we identified as bad loci
            if locus not in loci_to_skip:
                seqdict[locus].append(contig)
        output_name = "{}.fasta".format(taxon.replace('_', '-'))
        fout_name = os.path.join(args.output, output_name)
        print "\tOutput filename is {}".format(output_name)
        fout = fasta.FastaWriter(fout_name)
        # this tracks "fake" contig number
        count = 0
        # this tracks loci kept
        kept = 0
        # when > 1 contig, assemble contigs across matches
        sys.stdout.write("\tWriting and Aligning/Assembling UCE loci with multiple probes (dot/1000 loci)")
        for k, v in seqdict.iteritems():
            bad = False
            contig_names = []
            if count % 1000 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()
            if len(v) == 1:
                # trim ambiguous bases on flanks
                record = v[0]
                orient = [matches[k][0][1]]
                if args.flank:
                    record = trim_uce_reads(record, args.flank)
                contig_names.append(record.identifier)
                record.sequence = record.sequence.strip('N')
                # trim many ambiguous bases within contig
                result = manyn.search(record.sequence)
                if result:
                    uce_start, uce_end = get_probe_positions(record)
                    uce = record.sequence[uce_start:uce_end]
                    record.sequence = snip_if_many_N_bases(manyn, k, record.sequence, uce, verbose=False)
                # change header
                record.identifier = ">Node_{0}_length_{1}_cov_1000".format(
                        count,
                        len(record.sequence)
                    )
                fout.write(record)
            else:
                orient = list(set([m[1] for m in matches[k]]))
                # skip any loci having matches of mixed orientation
                # ['+', '-']
                if len(orient) == 1:
                    # create tempfile for the reads
                    fd, temp = tempfile.mkstemp(suffix='.fasta')
                    os.close(fd)
                    temp_out = fasta.FastaWriter(temp)
                    # write all slices to outfile, trimming if we want
                    #pdb.set_trace()
                    for record in v:
                        if args.flank:
                            record = trim_uce_reads(record, args.flank)
                        # keep names of contigs we assembled to store in db assoc
                        # w/ resulting assembled contig name
                        contig_names.append(record.identifier)
                        record.sequence = record.sequence.strip('N')
                        # trim many ambiguous bases within contig
                        result = manyn.search(record.sequence)
                        if result:
                            uce_start, uce_end = get_probe_positions(record)
                            uce = record.sequence[uce_start:uce_end]
                            record.sequence = snip_if_many_N_bases(manyn, k, record.sequence, uce, verbose=False)
                        temp_out.write(record)
                    # make sure to close the file
                    temp_out.close()
                    # assemble
                    aln = Align(temp)
                    aln.run_alignment()
                    record = fasta.FastaSequence()
                    record.sequence = aln.alignment_consensus.tostring()
                    record.identifier = ">Node_{0}_length_{1}_cov_1000".format(
                            count,
                            len(record.sequence)
                        )
                    fout.write(record)
                else:
                    bad = True
            if not bad:
                # track contig assembly and renaming data in db
                q = "UPDATE matches SET {0} = 1 WHERE uce = '{1}'".format(taxon, k)
                c.execute(q)
                # generate db match and match map tables for data
                orient_key = "node_{0}({1})".format(count, orient[0])
                q = "UPDATE match_map SET {0} = '{1}' WHERE uce = '{2}'".format(taxon, orient_key, k)
                c.execute(q)
                # keep track of new name :: old name mapping
                for old_name in contig_names:
                    q = "INSERT INTO contig_map VALUES ('{0}', '{1}', '{2}', '{3}')".format(taxon, k, old_name, record.identifier)
                    c.execute(q)
                kept += 1
            # tracking "fake" contig number
            count += 1
        conn.commit()
        print "\n\t{0} loci of {1} matched ({2:.0f}%), {3} dupes dropped ({4:.0f}%), {5} ({6:.0f}%) kept".format(
            count,
            len(uces),
            float(count) / len(uces) * 100,
            len(loci_to_skip),
            float(len(loci_to_skip)) / len(uces) * 100,
            kept,
            float(kept) / len(uces) * 100
            )
    #conn.commit()
    c.close()
    conn.close()
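The snip_if_many_N_bases helper above is driven by the manyn pattern compiled at the top of main. A toy check of that pattern (sequence hypothetical):

import re

manyn = re.compile("[Nn]{20,}")  # runs of 20+ ambiguous bases
seq = "ACGT" + "N" * 25 + "ACGT"
print bool(manyn.search(seq))  # True - this contig would be snipped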
Example #15
def main():
    args = get_args()
    if args.regex and args.repl is not None:
        # "s_[0-9]+$"
        regex = re.compile(args.regex)
        uces = set([
            get_name(read.identifier, "|", 1, regex=regex, repl=args.repl)
            for read in fasta.FastaReader(args.query)
        ])
    else:
        uces = set([
            get_name(read.identifier, "|", 1)
            for read in fasta.FastaReader(args.query)
        ])
        regex = None
    if args.dupefile:
        print "\t Getting dupes"
        dupes = get_dupes(args.dupefile, regex, args.repl)
    fasta_files = glob.glob(os.path.join(args.contigs, '*.fa*'))
    organisms = get_organism_names_from_fasta_files(fasta_files)
    conn, c = create_probe_database(
        os.path.join(args.output, 'probe.matches.sqlite'), organisms, uces)
    print "Processing:"
    for contig in fasta_files:
        critter = os.path.basename(contig).split('.')[0].replace('-', "_")
        output = os.path.join(
            args.output,
            os.path.splitext(os.path.basename(contig))[0] + '.lastz'
        )
        contigs = contig_count(contig)
        # align the probes to the contigs
        alignment = lastz.Align(contig, args.query, args.coverage,
                                args.identity, output)
        lzstdout, lztstderr = alignment.run()
        # parse the lastz results of the alignment
        matches, orientation, revmatches = \
                defaultdict(set), defaultdict(set), defaultdict(set)
        probe_dupes = set()
        if not lztstderr:
            for lz in lastz.Reader(output):
                # get strandedness of match
                contig_name = get_name(lz.name1)
                uce_name = get_name(lz.name2,
                                    "|",
                                    1,
                                    regex=regex,
                                    repl=args.repl)
                if args.dupefile and uce_name in dupes:
                    probe_dupes.add(uce_name)
                else:
                    matches[contig_name].add(uce_name)
                    orientation[uce_name].add(lz.strand2)
                    revmatches[uce_name].add(contig_name)
        # we need to check nodes for dupe matches to the same probes
        contigs_matching_mult_uces = check_contigs_for_dupes(matches)
        uces_matching_mult_contigs = check_probes_for_dupes(revmatches)
        nodes_to_drop = contigs_matching_mult_uces.union(
            uces_matching_mult_contigs)
        # remove dupe and/or dubious nodes/contigs
        match_copy = copy.deepcopy(matches)
        for k in match_copy.keys():
            if k in nodes_to_drop:
                del matches[k]
        store_lastz_results_in_db(c, matches, orientation, critter)
        conn.commit()
        pretty_print_output(critter, matches, contigs, probe_dupes,
                            contigs_matching_mult_uces,
                            uces_matching_mult_contigs)