Example 1
def main():
    start_time = time.time()
    print('Started: ', time.strftime("%a %b %d, %Y  %H:%M:%S",
                                     time.localtime(start_time)))
    options, arg = interface()
    alignment = lastz.Align(options.target, options.query, options.coverage,
                            options.identity, options.output)
    lzstdout, lztstderr = alignment.run()
    if lztstderr:
        # drop into the debugger whenever lastz reports anything on stderr
        pdb.set_trace()
    end_time = time.time()
    print('Ended: ', time.strftime("%a %b %d, %Y  %H:%M:%S",
                                   time.localtime(end_time)))
    print('Time for execution: ', (end_time - start_time) / 60, 'minutes')
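
Example 1 drops into the debugger whenever lastz writes to stderr; Example 2 below raises instead. A minimal sketch of that fail-fast variant, assuming only that alignment.run() returns a (stdout, stderr) pair as in these examples:

def run_alignment_or_raise(alignment):
    # run the wrapped lastz job and fail fast on any stderr output
    lzstdout, lztstderr = alignment.run()
    if lztstderr:
        raise EnvironmentError("lastz: {}".format(lztstderr))
    return lzstdout
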
Example 2
def main(args):
    # args arrive pre-parsed; anchor the user-supplied regex at the start of the name
    pre_regex = args.regex
    regex = re.compile("^(%s)(?:.*)" % pre_regex)
    if not os.path.isdir(args.output):
        os.makedirs(args.output)
    else:
        raise IOError(
            "The directory {} already exists. Please check and remove it by hand."
            .format(args.output))
    exons = set(
        new_get_probe_name(seq.id, regex)
        for seq in SeqIO.parse(args.reference, 'fasta'))
    if args.dupefile:
        dupes = get_dupes(log, args.dupefile, regex)
    else:
        dupes = set()
    fasta_files = glob.glob(os.path.join(args.contigs, '*.fa*'))
    for f in fasta_files:
        # mask ambiguous IUPAC bases with N in sequence lines only
        replace_bad_fasta_chars = "sed -i -e '/>/! s=[KYRSMWBDHVkyrsmwbdhv]=N=g' %s" % f
        # sed -i -e leaves "-e"-suffixed backup copies on some platforms
        remove_os_sed_copies = "rm %s/*-e " % args.contigs
        fasta_name = os.path.basename(f)
        if not fasta_name.startswith('sample'):
            rename_samples = "mv %s %s/sample_%s" % (f, args.contigs, fasta_name)
            os.system(rename_samples)
        os.system(replace_bad_fasta_chars)
        os.system(remove_os_sed_copies)
    fasta_files = glob.glob(os.path.join(args.contigs, '*.fa*'))
    organisms = get_organism_names_from_fasta_files(fasta_files)
    conn, c = create_probe_database(
        log, os.path.join(args.output, 'probe.matches.sqlite'), organisms,
        exons)
    log.info("Processing contig data")
    # open a file for duplicate writing, if we're interested
    if args.keep_duplicates is not None:
        dupefile = open(args.keep_duplicates, 'w')
    else:
        dupefile = None
    log.info("{}".format("-" * 65))
    kmers = {}
    for contig in sorted(fasta_files):
        critter = os.path.basename(contig).split('.')[0].replace('-', "_")
        output = os.path.join(
            args.output,
            os.path.splitext(os.path.basename(contig))[0] + '.lastz')
        contigs = contig_count(contig)
        # align the probes to the contigs
        alignment = lastz.Align(contig, args.reference, args.min_coverage,
                                args.min_identity, output)
        lzstdout, lztstderr = alignment.run()
        if lztstderr:
            raise EnvironmentError("lastz: {}".format(lztstderr))
        # parse the lastz results of the alignment
        matches = defaultdict(set)
        orientation = defaultdict(set)
        revmatches = defaultdict(set)
        probe_dupes = set()
        # any stderr output would already have raised above, so parse directly
        for lz in lastz.Reader(output):
            contig_name = get_contig_name(lz.name1, args)
            exon_name = new_get_probe_name(lz.name2, regex)
            if args.dupefile and exon_name in dupes:
                probe_dupes.add(exon_name)
            else:
                matches[contig_name].add(exon_name)
                orientation[exon_name].add(lz.strand2)
                revmatches[exon_name].add(contig_name)

        # we need to check nodes for dupe matches to the same probes
        contigs_matching_mult_exons = check_contigs_for_dupes(matches)
        exon_dupe_contigs, exon_dupe_exons = check_loci_for_dupes(revmatches)
        nodes_to_drop = contigs_matching_mult_exons.union(exon_dupe_contigs)
        # write out duplicates if requested
        if dupefile is not None:
            log.info("Writing duplicates file for {}".format(critter))
            if len(exon_dupe_exons) != 0:
                dupefile.write(
                    "[{} - probes hitting multiple contigs]\n".format(critter))
                for exon in exon_dupe_exons:
                    dupefile.write("{}:{}\n".format(
                        exon, ', '.join(revmatches[exon])))
                dupefile.write("\n")
            if len(contigs_matching_mult_exons) != 0:
                dupefile.write(
                    "[{} - contigs hitting multiple probes]\n".format(critter))
                for dupe in contigs_matching_mult_exons:
                    dupefile.write("{}:{}\n".format(dupe,
                                                    ', '.join(matches[dupe])))
                dupefile.write("\n")
                dupefile.write("[{} - contig orientation]\n".format(critter))
                for dupe in contigs_matching_mult_exons:
                    matches_list = list(matches[dupe])
                    for exon in matches_list:
                        dupefile.write("{}:{}\n".format(
                            exon,
                            list(orientation[exon])[0]))
                dupefile.write("\n")

        # remove dupe and/or dubious nodes/contigs
        match_copy = copy.deepcopy(matches)
        for k in match_copy.keys():
            if k in nodes_to_drop:
                del matches[k]

        # collect the kmer count of every target contig that survived filtering
        for lz in lastz.Reader(output):
            for element in matches:
                # the leading integer in the lastz name field is the contig id
                if re.search(r"^(\d*)\s\d*\s\d*.*",
                             lz[1]).groups()[0] == element:
                    kmer_value = get_kmer_value(lz.name1)
                    kmers.setdefault(contig, []).append(kmer_value)
        store_lastz_results_in_db(c, matches, orientation, critter)
        conn.commit()
        pretty_log_output(log, critter, matches, contigs, probe_dupes,
                          contigs_matching_mult_exons, exon_dupe_exons)

    with open(os.path.join(args.output, 'kmer_count.txt'), 'w') as kmerfile:
        for key in kmers:
            count = sum(int(element) for element in kmers[key])
            kmerfile.write("%s : %d\n" %
                           (os.path.basename(key).split('.')[0], count))

    if dupefile is not None:
        dupefile.close()
    log.info("{}".format("-" * 65))
    log.info("The LASTZ alignments are in {}".format(args.output))
    log.info("The exon match database is in {}".format(
        os.path.join(args.output, "probes.matches.sqlite")))
    text = "Completed"

    log.info(text.center(65, "="))

    # export the match table from the SQLite database as a tab-separated text file
    sql_file = os.path.join(args.output, 'probe.matches.sqlite')
    tsf_out = os.path.join(args.output, 'match_table.txt')
    sql_cmd = "%s -header -nullvalue '.' -separator '\t' %s \"select * from matches;\" > %s" % (
        args.sqlite3, sql_file, tsf_out)
    os.system(sql_cmd)

    # Create the config file for the extraction of the desired loci
    output_folder = args.output

    with open(os.path.join(output_folder, 'config'), 'w') as f:
        print('[Organisms]', file=f)
        for aln in glob.glob(os.path.join(output_folder, '*.lastz')):
            aln = os.path.basename(aln).replace('.lastz', '')
            print(aln, file=f)

        print('\n[Loci]', file=f)
        with open(os.path.join(output_folder,
                               'match_table.txt')) as match_table:
            lines = match_table.readlines()
        for line in lines[1:]:
            print(line.split('\t')[0], file=f)
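
The helpers check_contigs_for_dupes and check_loci_for_dupes are not shown on this page; a plausible minimal sketch of the first, assuming matches maps contig names to sets of exon names as built above:

def check_contigs_for_dupes(matches):
    # hypothetical sketch: a contig is dubious when it hits more than one exon
    return set(contig for contig, exons in matches.items() if len(exons) > 1)

The sed/mv shell calls above could also be replaced with pure Python; a shell-free sketch of the base-masking step, assuming the goal is only to replace ambiguous IUPAC codes with N in sequence lines:

import re

AMBIGUOUS = re.compile("[KYRSMWBDHVkyrsmwbdhv]")

def mask_ambiguous_bases(path):
    # rewrite a FASTA file in place, masking ambiguity codes in sequence lines only
    with open(path) as handle:
        lines = handle.readlines()
    with open(path, "w") as handle:
        for line in lines:
            handle.write(line if line.startswith(">") else AMBIGUOUS.sub("N", line))
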
Example 3
def main():
    args = get_args()
    if args.regex and args.repl is not None:
        # e.g. "s_[0-9]+$"
        regex = re.compile(args.regex)
        uces = {get_name(read.identifier, "|", 1, regex=regex, repl=args.repl)
                for read in fasta.FastaReader(args.query)}
    else:
        uces = {get_name(read.identifier, "|", 1)
                for read in fasta.FastaReader(args.query)}
        regex = None
    if args.dupefile:
        print "\t Getting dupes"
        dupes = get_dupes(args.dupefile, regex, args.repl)
    contig = args.contigs  # a single contigs file, not a directory glob
    organisms = ["contigs"]  # vs. get_organism_names_from_fasta_files(fasta_files)
    conn, c = create_probe_database(uces)
    print("Processing:")
    # this variant handles one contigs file, writing a single alignment to args.align
    critter = os.path.basename(contig).split('.')[0].replace('-', "_")
    contigs = contig_count(contig)
    # align the probes to the contigs
    alignment = lastz.Align(contig, args.query, args.coverage,
                            args.identity, args.align)
    lzstdout, lztstderr = alignment.run()
    # parse the lastz results of the alignment
    matches, orientation, revmatches = (
        defaultdict(set), defaultdict(set), defaultdict(set))
    probe_dupes = set()
    if not lztstderr:
        for lz in lastz.Reader(args.align):
            # get strandedness of match
            contig_name = get_name(lz.name1)
            uce_name = get_name(lz.name2, "|", 1, regex=regex, repl=args.repl)
            if args.dupefile and uce_name in dupes:
                probe_dupes.add(uce_name)
            else:
                matches[contig_name].add(uce_name)
                orientation[uce_name].add(lz.strand2)
                revmatches[uce_name].add(contig_name)
    else:
        print("Error in lastz:")
        print("STDerr:")
        print(lztstderr)
        print("STDout:")
        print(lzstdout)

    # we need to check nodes for dupe matches to the same probes
    contigs_matching_mult_uces = check_contigs_for_dupes(matches)
    uces_matching_mult_contigs = check_probes_for_dupes(revmatches)
    nodes_to_drop = contigs_matching_mult_uces
    nodes_to_drop_one_of = uces_matching_mult_contigs
    # remove dupe and/or dubious nodes/contigs
    match_copy = copy.deepcopy(matches)
    already_observed = list()
    for k in match_copy.keys():
        if k in nodes_to_drop:
            del matches[k]
        elif k in nodes_to_drop_one_of:
            # keep the first contig seen for each duplicate match set, drop the rest
            if matches[k] in already_observed:
                del matches[k]
            else:
                already_observed.append(matches[k])
    store_lastz_results_in_db(c, matches, orientation, critter)
    conn.commit()
    pretty_print_output(critter, matches, contigs, probe_dupes,
                        contigs_matching_mult_uces,
                        uces_matching_mult_contigs)
    # get all the UCE records from the db
    query = "SELECT uce, {0} FROM match_map WHERE {0} IS NOT NULL".format("contigs")
    c.execute(query)
    data = {row[1].split("(")[0]:row[0] for row in c.fetchall()}
    nodenames = set(data.keys())
    # make sure we don't lose any dupes
    assert len(data) == len(nodenames), "There were duplicate contigs."
    print("Building UCE fasta:")
    with open(args.output, 'w') as outp:
        for record in SeqIO.parse(contig, 'fasta'):
            name = '_'.join(record.id.split('_')[:2])
            if name.lower() in nodenames:
                record.id = "{0}|{1}".format(data[name.lower()], record.id)
                outp.write(record.format('fasta'))
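
get_name is another helper these examples assume; a hypothetical reconstruction inferred from its call sites (trim the FASTA header to its leading fields, lowercase it, and optionally apply a regex replacement):

import re

def get_name(header, splitchar="_", items=2, regex=None, repl=None):
    # hypothetical sketch: normalize a FASTA header to a short, comparable name
    name = "_".join(header.lstrip('>').split(splitchar)[:items]).strip().lower()
    if regex is not None and repl is not None:
        name = re.sub(regex, repl, name)
    return name
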
Example 4
def main():
    args = get_args()
    if args.regex and args.repl is not None:
        # e.g. "s_[0-9]+$"
        regex = re.compile(args.regex)
        uces = {
            get_name(read.identifier, "|", 1, regex=regex, repl=args.repl)
            for read in fasta.FastaReader(args.query)
        }
    else:
        uces = {
            get_name(read.identifier, "|", 1)
            for read in fasta.FastaReader(args.query)
        }
        regex = None
    if args.dupefile:
        print "\t Getting dupes"
        dupes = get_dupes(args.dupefile, regex, args.repl)
    fasta_files = glob.glob(os.path.join(args.contigs, '*.fa*'))
    organisms = get_organism_names_from_fasta_files(fasta_files)
    conn, c = create_probe_database(
        os.path.join(args.output, 'probe.matches.sqlite'), organisms, uces)
    print "Processing:"
    for contig in fasta_files:
        critter = os.path.basename(contig).split('.')[0].replace('-', "_")
        output = os.path.join(
            args.output,
            os.path.splitext(os.path.basename(contig))[0] + '.lastz')
        contigs = contig_count(contig)
        # align the probes to the contigs
        alignment = lastz.Align(contig, args.query, args.coverage,
                                args.identity, output)
        lzstdout, lztstderr = alignment.run()
        # parse the lastz results of the alignment
        matches, orientation, revmatches = (
            defaultdict(set), defaultdict(set), defaultdict(set))
        probe_dupes = set()
        if not lztstderr:
            for lz in lastz.Reader(output):
                # get strandedness of match
                contig_name = get_name(lz.name1)
                uce_name = get_name(lz.name2,
                                    "|",
                                    1,
                                    regex=regex,
                                    repl=args.repl)
                if args.dupefile and uce_name in dupes:
                    probe_dupes.add(uce_name)
                else:
                    matches[contig_name].add(uce_name)
                    orientation[uce_name].add(lz.strand2)
                    revmatches[uce_name].add(contig_name)
        # we need to check nodes for dupe matches to the same probes
        contigs_matching_mult_uces = check_contigs_for_dupes(matches)
        uces_matching_mult_contigs = check_probes_for_dupes(revmatches)
        nodes_to_drop = contigs_matching_mult_uces.union(
            uces_matching_mult_contigs)
        # remove dupe and/or dubious nodes/contigs
        match_copy = copy.deepcopy(matches)
        for k in match_copy.keys():
            if k in nodes_to_drop:
                del matches[k]
        store_lastz_results_in_db(c, matches, orientation, critter)
        conn.commit()
        pretty_print_output(critter, matches, contigs, probe_dupes,
                            contigs_matching_mult_uces,
                            uces_matching_mult_contigs)
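
contig_count, used in Examples 2 through 4 to report how many contigs each file holds, is likewise not shown on this page; a minimal sketch under the assumption that it simply counts FASTA records:

def contig_count(fasta_path):
    # hypothetical sketch: count records by counting FASTA header lines
    with open(fasta_path) as handle:
        return sum(1 for line in handle if line.startswith(">"))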