def main():
    args = get_args()
    # regex matching runs of 20 or more ambiguous (N/n) bases
    regex = re.compile("[Nn]{20,}")
    if args.dupefile:
        dupes = get_dupes(args.dupefile, longfile=False)
    else:
        dupes = None
    matches, probes = get_matches(args.lastz, args.splitchar, args.components,
                                  args.fish)
    #unique_matches = sum([1 for uce, map_pos in matches.iteritems() if len(map_pos) == probes[uce]])
    if args.fasta:
        tb = bx.seq.twobit.TwoBitFile(file(args.genome))
    count = 0
    for k, v in matches.iteritems():
        chromo, strand, start, end, skip = quality_control_matches(
            matches, probes, dupes, k, v, args.verbose)
        if not skip and args.fasta:
            prep_and_write_fasta(tb, regex, args.fasta, chromo, strand, start,
                                 end, count, args.flank)
        if not skip and args.bed:
            args.bed.write("{0} {1} {2} {3} 1000 {4}\n".format(
                chromo, start - args.flank, end + args.flank, k, strand))
        count += 1
        #pdb.set_trace()
    if args.fasta:
        args.fasta.close()
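prep_and_write_fasta is defined elsewhere in the script; a minimal sketch of the slicing it presumably performs, assuming bx-python's TwoBitFile slice interface and a small reverse-complement helper for minus-strand hits (names here are hypothetical, not the original implementation):

import string

def revcomp(seq):
    # reverse complement (Python 2 string.maketrans)
    return seq.translate(string.maketrans('ACGTNacgtn', 'TGCANtgcan'))[::-1]

def write_slice_sketch(tb, outf, chromo, strand, start, end, count, flank):
    # hypothetical helper: slice the 2bit genome with flanks and write one FASTA record
    lower = max(start - flank, 0)
    seq = tb[chromo][lower:end + flank]
    if strand == '-':
        seq = revcomp(seq)
    outf.write(">slice_{0}|{1}:{2}-{3}\n{4}\n".format(count, chromo, lower, end + flank, seq))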
Example No. 2
def main():
    args = get_args()
    uces = set([
        get_name(read.identifier, "|", 1)
        for read in fasta.FastaReader(args.query)
    ])
    files = glob.glob(os.path.join(args.lastz, '*.lastz'))
    # this prob. needs to be more robust
    organisms = [
        os.path.splitext(os.path.basename(f).split('-')[-1])[0].replace(
            '-', "_") for f in files
    ]
    conn, c = create_match_database(args.db, organisms, uces)
    if args.dupefile:
        dupes = get_dupes(args.dupefile)
    else:
        dupes = None
    #pdb.set_trace()
    for f in files:
        critter = os.path.splitext(os.path.basename(f).split('-')[-1])[0]
        matches, probes = get_matches(f, args.splitchar, args.components)
        count = 0
        for k, v in matches.iteritems():
            skip = False
            if len(v) > 1:
                if run_checks(k, v, probes, args.verbose):
                    # sort by match position
                    v_sort = sorted(v, key=itemgetter(2))
                    start, end = v_sort[0][2], v_sort[-1][3]
                    diff = end - start
                    # ensure our range is less than N(probes) * probe_length - this
                    # still gives us a little wiggle room because probes are ~ 2X tiled
                    if diff > (probes[k] * 120):
                        skip = True
                        if args.verbose:
                            print "range longer than expected"
                else:
                    skip = True
            elif args.dupefile and k in dupes:
                skip = True
                if args.verbose: print "{0} is in dupefile".format(k)
            else:
                pass
            if not skip:
                store_lastz_results_in_db(c, critter, k)
                count += 1
        print "Entered {} matches for {}".format(count, critter)
    conn.commit()
    c.close()
    conn.close()
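create_match_database and store_lastz_results_in_db are defined elsewhere; judging from the UPDATE statements in the probe-assembly example further down ("UPDATE matches SET <taxon> = 1 WHERE uce = ..."), the database appears to hold one row per UCE locus and one 0/1 column per organism. A rough sketch under that assumption (helper names are hypothetical):

import sqlite3

def create_match_database_sketch(db, organisms, uces):
    # hypothetical schema: one row per UCE locus, one integer flag column per organism
    conn = sqlite3.connect(db)
    c = conn.cursor()
    columns = ", ".join(["{0} integer DEFAULT 0".format(o) for o in organisms])
    c.execute("CREATE TABLE matches (uce text PRIMARY KEY, {0})".format(columns))
    c.executemany("INSERT INTO matches (uce) VALUES (?)", [(u,) for u in uces])
    return conn, c

def store_lastz_results_in_db_sketch(c, critter, uce):
    # hypothetical: flag the locus as matched for this organism
    c.execute("UPDATE matches SET {0} = 1 WHERE uce = ?".format(critter), (uce,))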
Example No. 3
def check_for_dupes(probe_set):
    """create some temp files and search a newly-designed probe-set for dupes"""
    # write to a tempfile
    f = tempfile.NamedTemporaryFile(mode='w', delete=False)
    for ps in probe_set:
        SeqIO.write(ps, f, 'fasta')
    f.close()
    # align f to itself
    lz = lastz.Align(f.name, f.name, 70, 80)
    lz.run()
    dupes = get_dupes(lz.output, pos=2)
    os.remove(f.name)
    os.remove(lz.output)
    return dupes
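get_dupes itself is not shown in these examples; a hypothetical sketch of the idea, assuming a tab-delimited LASTZ output with the probe/locus name in column pos and treating any name that aligns to more than one location as a duplicate (the real implementation may differ):

from collections import Counter

def get_dupes_sketch(lastz_file, pos=1):
    # hypothetical: count how often each name appears among the LASTZ hits
    counts = Counter()
    for line in open(lastz_file):
        if line.strip():
            counts[line.strip().split('\t')[pos]] += 1
    # names hit more than once are flagged as potential duplicates
    return set(name for name, n in counts.items() if n > 1)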
Example No. 6
def main():
    args = get_args()
    if args.dupefile:
        dupes = get_dupes(args.dupefile)
    else:
        dupes = None
    #pdb.set_trace()
    # get dbSNP data
    all_snps = get_xml_data(args.xml)
    used = set()
    # iterate over intersections
    args.output.write('rsid,pos,maf,1000g\n')
    for row in args.dbsnp:
        if not row.startswith('UCE'):
            uce, chromo, start, end, snp, snps, snpe = row.strip('\n').split(',')
            start, end, snps, snpe = map(int, [start, end, snps, snpe])
            # get relative position
            if not snpe - snps > 1 and snp not in used and (not dupes or uce not in dupes):
                middle = int(round((start + end)/2, 0))
                rel_snp_pos = snps - middle
                # lookup data for snps
                if all_snps[snp.strip('rs')].val_1000G and all_snps[snp.strip('rs')].val_1000G.lower() == 'true':
                    thousandg = True
                else:
                    thousandg = False
                if not all_snps[snp.strip('rs')].freq_freq:
                    freq = 0.0
                else:
                    freq = float(all_snps[snp.strip('rs')].freq_freq)
                args.output.write("{0},{1},{2},{3}\n".format(
                    snp, 
                    rel_snp_pos,
                    freq, 
                    thousandg
                    )
                )
                # make sure we skip any duplicates
                used.add(snp)
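get_xml_data parses the dbSNP XML elsewhere; from its usage above it appears to return a dict keyed by the rs number (without the 'rs' prefix), each value carrying val_1000G and freq_freq attributes. A hypothetical stand-in, handy for exercising the CSV logic without the XML:

from collections import namedtuple

SnpRecord = namedtuple('SnpRecord', ['val_1000G', 'freq_freq'])

def fake_xml_data(rows):
    # rows: iterable of (rsid-without-prefix, 1000g-validation-string, minor-allele-frequency)
    return dict((rsid, SnpRecord(validated, freq)) for rsid, validated, freq in rows)

# e.g. fake_xml_data([('12345', 'true', '0.12'), ('67890', None, None)])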
Example No. 7
def main():
    args = get_args()
    print get_dupes(args.lastz)
Example No. 9
def main():
    args = get_args()
    if args.dupefile:
        dupes = get_dupes(args.dupefile)
    else:
        dupes = None
    used = set()
    mx = max([int(row.strip('\n').split(',')[3]) \
            - int(row.strip('\n').split(',')[2]) \
            for row in open(args.dbsnp,'rU') if not row.startswith('UCE')])
    # get the SNP metadata
    all_snps = get_xml_data(args.xml)
    # find the middle
    overall_middle = int(round(mx / 2, 0))
    # list to hold results
    l = numpy.zeros(mx + 1)
    positions = copy.deepcopy(l)
    # create a dict to hold the results by position in longest array
    #differences = dict((d,numpy.array([])) for d in range(-middle, middle + 1))
    # iterate over intersections
    d = {}
    if args.output2:
        args.output2.write(
            'UCE,chromo,uce-start,uce-end,snp-name,snp-start,snp-end,1000gvalidated,freq\n'
        )
    for row in open(args.dbsnp, 'rU'):
        if not row.startswith('UCE'):
            uce, chromo, start, end, snp, snps, snpe = row.strip('\n').split(
                ',')
            start, end, snps, snpe = map(int, [start, end, snps, snpe])
            # get middle of this UCE
            middle = int(round((start + end) / 2, 0))
            #pdb.set_trace()
            if snp not in used:
                if not snpe - snps > 1 \
                        and (not dupes or uce not in dupes) \
                        and all_snps[snp.strip('rs')].val_1000G == 'true' \
                        and all_snps[snp.strip('rs')].freq_freq is not None:
                    if uce not in d:
                        d[uce] = numpy.zeros(mx + 1)
                    rel_snp_pos = snps - middle
                    d[uce][overall_middle + rel_snp_pos] = \
                        all_snps[snp.strip('rs')].freq_freq
                if args.output2 and not snpe - snps > 1 \
                        and snp not in used and (not dupes or uce not in dupes):
                    args.output2.write("{},{},{},{},{},{},{},{},{}\n".format(
                        uce, chromo, start, end, snp, snps, snpe,
                        all_snps[snp.strip('rs')].val_1000G,
                        all_snps[snp.strip('rs')].freq_freq))
                used.add(snp)
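    # rows of stack are UCE loci; columns are SNP positions relative to each UCE's midpoint
    # (index overall_middle corresponds to offset 0), padded to the length of the longest UCE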
    stack = numpy.array([d[uce] for uce in d.keys()])
    #pdb.set_trace()
    # compute the running average
    win = 25
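    # count, at each position, how many loci carry a validated SNP (builtin sum adds the boolean rows)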
    data = sum(stack > 0)
    weightings = numpy.repeat(1.0, win) / win
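    # trim the convolution to its fully-overlapping ('valid') region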
    running = numpy.convolve(data, weightings)[win - 1:-(win - 1)]
    args.output.write("pos,avg,ci,datatype\n")
    for base in range(len(running)):
        pos = base - overall_middle
        args.output.write("{},{},,running\n".format(pos, running[base]))
    # also output the average heterozygosity of 1000 Genome validated, hetero SNPs.
    for base in range(len(stack[0])):
        pos = base - overall_middle
        values = numpy.where(stack[:, base] != 0)[0]
        # reindex
        avg = numpy.mean(stack[:, base][values])
        ci = 1.96 * (numpy.std(stack[:, base][values], ddof=1) /
                     numpy.sqrt(len(stack[:, base][values])))
        args.output.write("{},{},{},mean_hetero\n".format(pos, avg, ci))
    win = 25
    # average frequency across loci at each position (axis=0), then a running average over positions,
    # mirroring the running-average block above
    data = numpy.mean(stack, axis=0)
    weightings = numpy.repeat(1.0, win) / win
    running = numpy.convolve(data, weightings)[win - 1:-(win - 1)]
    for base in range(len(running)):
        pos = base - overall_middle
        args.output.write("{},{},,running_hetero\n".format(pos, running[base]))
Example No. 10
def main():
    args = get_args()
    # compile some regular expressions we'll use later
    stripnum = re.compile("s_[0-9]+$")
    manyn = re.compile("[Nn]{20,}")
    # get names of loci and taxa
    uces = get_uce_names_from_probes(args.probes, regex=stripnum, repl='s', lower=True)
    taxa = get_taxa_names_from_fastas(args.fasta)
    print "\n"
    if not args.extend:
        if args.db is None:
            db = os.path.join(args.output, 'probe.matches.sqlite')
        else:
            db = args.db
        # create db to hold results
        conn, c = create_probe_database(
                db,
                taxa,
                uces,
                True
            )
    else:
        conn, c = extend_probe_database(
                args.db,
                taxa
            )
    # get duplicate probe sequences for filtering
    if args.dupefile:
        print "Determining duplicate probes..."
        dupes = get_dupes(args.dupefile, longfile=False)
        # because of structure, strip probe designation from dupes,
        # leaving only the locus name; lowercase everything
        dupes = set([re.sub(stripnum, 's', d).lower() for d in dupes])
    else:
        # no dupefile given: nothing to filter
        dupes = set()
    # iterate over LASTZ files for each taxon
    for lz in glob.glob(os.path.join(args.lastz, '*')):
        # get taxon name from lastz file
        taxon = get_taxon_from_filename(lz)
        print "\n{0}\n{1}\n{0}".format('=' * 30, taxon)
        # get fasta name from lastz file
        ff = get_fasta_name_from_lastz_pth(lz, args.fasta)
        # get lastz matches
        print "\tGetting LASTZ matches from GENOME alignments..."
        if not args.oldprobe:
            matches, probes = get_bgi_matches(lz, stripnum)
        else:
            matches, probes = get_old_probe_matches(lz)
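        # matches: locus -> list of alignment hits (index 1 holds the strand);
        # probes: locus -> number of probes targeting that locus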
        # remove bad loci (dupes)
        print "\tGetting bad (potentially duplicate) GENOME matches..."
        loci_to_skip = []
        for k, v in matches.iteritems():
            # check matches to makes sure all is well - keep names lc
            loci_to_skip.extend(quality_control_matches(matches, probes, dupes, k, v, False))
        #pdb.set_trace()
        # convert to set, to keep only uniques
        loci_to_skip = set(loci_to_skip)
        print "\tSkipping {} bad (duplicate hit) loci...".format(len(loci_to_skip))
        # get (and possibly assemble) non-skipped
        seqdict = defaultdict(list)
        # determine those contigs to skip and group those to assemble
        for contig in fasta.FastaReader(ff):
            # make sure all names are lowercase
            contig.identifier = contig.identifier.lower()
            if not args.oldprobe:
                name = contig.identifier.split('|')[-3]
                locus = re.sub(stripnum, 's', name)
            else:
                locus = contig.identifier.split('|')[-5]
            # skip what we identified as bad loci
            if locus not in loci_to_skip:
                seqdict[locus].append(contig)
        output_name = "{}.fasta".format(taxon.replace('_', '-'))
        fout_name = os.path.join(args.output, output_name)
        print "\tOutput filename is {}".format(output_name)
        fout = fasta.FastaWriter(fout_name)
        # this tracks "fake" contig number
        count = 0
        # this tracks loci kept
        kept = 0
        # when > 1 contig, assemble contigs across matches
        sys.stdout.write("\tWriting and Aligning/Assembling UCE loci with multiple probes (dot/1000 loci)")
        for k, v in seqdict.iteritems():
            bad = False
            contig_names = []
            if count % 1000 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()
            if len(v) == 1:
                # trim ambiguous bases on flanks
                record = v[0]
                orient = [matches[k][0][1]]
                if args.flank:
                    record = trim_uce_reads(record, args.flank)
                contig_names.append(record.identifier)
                record.sequence = record.sequence.strip('N')
                # trim many ambiguous bases within contig
                result = manyn.search(record.sequence)
                if result:
                    uce_start, uce_end = get_probe_positions(record)
                    uce = record.sequence[uce_start:uce_end]
                    record.sequence = snip_if_many_N_bases(manyn, k, record.sequence, uce, verbose=False)
                # change header
                record.identifier = ">Node_{0}_length_{1}_cov_1000".format(
                        count,
                        len(record.sequence)
                    )
                fout.write(v[0])
            else:
                orient = list(set([m[1] for m in matches[k]]))
                # skip any loci having matches of mixed orientation
                # ['+', '-']
                if len(orient) == 1:
                    # create tempfile for the reads
                    fd, temp = tempfile.mkstemp(suffix='.fasta')
                    os.close(fd)
                    temp_out = fasta.FastaWriter(temp)
                    # write all slices to outfile, trimming if we want
                    #pdb.set_trace()
                    for record in v:
                        if args.flank:
                            record = trim_uce_reads(record, args.flank)
                        # keep names of contigs we assembled to store in db assoc
                        # w/ resulting assembled contig name
                        contig_names.append(record.identifier)
                        record.sequence = record.sequence.strip('N')
                        # trim many ambiguous bases within contig
                        result = manyn.search(record.sequence)
                        if result:
                            uce_start, uce_end = get_probe_positions(record)
                            uce = record.sequence[uce_start:uce_end]
                            record.sequence = snip_if_many_N_bases(manyn, k, record.sequence, uce, verbose=False)
                        temp_out.write(record)
                    # make sure to close the file
                    temp_out.close()
                    # assemble
                    aln = Align(temp)
                    aln.run_alignment()
                    record = fasta.FastaSequence()
                    record.sequence = aln.alignment_consensus.tostring()
                    record.identifier = ">Node_{0}_length_{1}_cov_1000".format(
                            count,
                            len(record.sequence)
                        )
                    fout.write(record)
                else:
                    bad = True
            if not bad:
                # track contig assembly and renaming data in db
                q = "UPDATE matches SET {0} = 1 WHERE uce = '{1}'".format(taxon, k)
                c.execute(q)
                # generate db match and match map tables for data
                orient_key = "node_{0}({1})".format(count, orient[0])
                q = "UPDATE match_map SET {0} = '{1}' WHERE uce = '{2}'".format(taxon, orient_key, k)
                c.execute(q)
                # keep track of new name :: old name mapping
                for old_name in contig_names:
                    q = "INSERT INTO contig_map VALUES ('{0}', '{1}', '{2}', '{3}')".format(taxon, k, old_name, record.identifier)
                    c.execute(q)
                kept += 1
            # tracking "fake" contig number
            count += 1
        conn.commit()
        print "\n\t{0} loci of {1} matched ({2:.0f}%), {3} dupes dropped ({4:.0f}%), {5} ({6:.0f}%) kept".format(
            count,
            len(uces),
            float(count) / len(uces) * 100,
            len(loci_to_skip),
            float(len(loci_to_skip)) / len(uces) * 100,
            kept,
            float(kept) / len(uces) * 100
            )
    #conn.commit()
    c.close()
    conn.close()
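The resulting probe.matches.sqlite database can then be summarized per taxon; a short sketch, assuming only the matches table and the per-taxon 0/1 columns implied by the UPDATE statements above (function name is hypothetical):

import sqlite3

def count_matches_sketch(db, taxa):
    # hypothetical summary: how many UCE loci were recovered for each taxon
    conn = sqlite3.connect(db)
    c = conn.cursor()
    for taxon in taxa:
        c.execute("SELECT COUNT(*) FROM matches WHERE {0} = 1".format(taxon))
        print "{0}\t{1}".format(taxon, c.fetchone()[0])
    conn.close()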