def alignmentTargets(genome_files, contig_files):
    '''
    build the parameter sets for aligning contigs to
    known ncbi genomes

    Every genome file is paired with every contig file.

    genome_files: iterable of genome filenames; the ".fna"
        suffix is removed from the basename via P.snip
    contig_files: iterable of contig filenames; the
        ".contigs.fa" suffix is removed via P.snip. The contig
        path is used as given, directory components are not
        stripped.

    returns a list (not a generator) of [outfile, genome, contig]
    lists, where outfile is
    alignment.dir/<contig>_vs_<genome>.delta
    '''
    parameters = []
    for genome, contig in itertools.product(genome_files, contig_files):
        contig_track = P.snip(contig, ".contigs.fa")
        genome_track = P.snip(os.path.basename(genome), ".fna")
        outfile = os.path.join(
            "alignment.dir",
            contig_track + "_vs_" + genome_track) + ".delta"
        parameters.append([outfile, genome, contig])
    return parameters
def filterContigsByCoverage(infiles, outfile):
    '''
    filter contigs by their average base coverage

    NOTE(review): no filtering is implemented here - the function
    only prints the contig file next to the track of each
    remaining input and never writes outfile.

    infiles: sequence whose first element is the contig file; the
        ".load" suffix is removed from the basename of every
        further element via P.snip
    outfile: currently unused
    '''
    # looked up but not applied yet; kept so that a missing
    # configuration value still fails at this point
    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(PARAMS["database"])
    try:
        for infile in infiles[1:]:
            print(contig_file, P.snip(os.path.basename(infile), ".load"))
    finally:
        # always release the database connection
        dbh.close()
def chimeraTargets(alignment_files, contig_files):
    '''
    build the parameter sets for scoring chimericity

    Every alignment file is paired with every contig file.

    alignment_files: iterable of alignment filenames; the ".bam"
        suffix is removed via P.snip
    contig_files: iterable of contig filenames

    returns a list (not a generator) of
    [outfile, alignment, contig] lists, where outfile is
    chimeras.dir/<alignment without .bam>.chimeras

    NOTE(review): outfile depends on the alignment only, so more
    than one contig file yields several entries with the same
    outfile - confirm this is intended.
    '''
    parameters = []
    # iterate over the alignment_files argument; the name
    # genome_files does not exist in this function
    for alignment, contig in itertools.product(alignment_files,
                                               contig_files):
        outfile = os.path.join("chimeras.dir",
                               P.snip(alignment, ".bam") + ".chimeras")
        parameters.append([outfile, alignment, contig])
    return parameters
def collectGenomeSizes(infile, outfile):
    '''
    output the genome sizes for each genome

    infile: fasta file, opened via iotools.openFile; the ".fna"
        suffix is removed from its basename to name the genome
    outfile: tab-separated file with the columns genome and
        length and one row per fasta entry
    '''
    inf = iotools.openFile(infile)
    try:
        with open(outfile, "w") as outf:
            outf.write("genome\tlength\n")
            # assume single fasta entry
            for fasta in FastaIterator.iterate(inf):
                name = P.snip(os.path.basename(infile), ".fna")
                # take the length directly instead of building a
                # per-base list of the whole genome first
                outf.write("%s\t%s\n" % (name, len(fasta.sequence)))
    finally:
        inf.close()
def calculateFalsePositiveRate(infiles, outfile):
    '''
    calculate the false positive rate in taxonomic
    abundances

    infiles: sequence; the first element is the file describing
        the true species, the remaining elements are estimate
        files. An estimate is only used if its basename, after
        dropping the first len("metaphlan_") characters, equals
        the basename of the true file. Table names are derived
        from the basenames via P.toTable.
    outfile: tab-separated file with the columns track, tp_rate,
        fp_rate and fn_rate. tp_rate and fp_rate are relative to
        the number of estimated species, fn_rate is relative to
        the number of true species.

    raises ValueError if no estimated or no true species were
    collected - the rates are undefined in that case and no
    output file is written.
    '''
    true_file = infiles[0]
    true_base = os.path.basename(true_file)
    prefix_length = len("metaphlan_")
    true_set = set()
    estimate_set = set()

    # connect to database
    dbh = sqlite3.connect(PARAMS["database"])
    try:
        cc = dbh.cursor()
        for estimate_file in infiles[1:]:
            estimate_base = os.path.basename(estimate_file)
            if estimate_base[prefix_length:] != true_base:
                continue
            true_table = P.toTable(true_base)
            estimate_table = P.toTable(estimate_base)

            # table names can not be bound as query parameters,
            # hence the string interpolation
            for row in cc.execute(
                    """SELECT species_name FROM %s""" %
                    true_table).fetchall():
                true_set.add(row[0])
            for row in cc.execute(
                    """SELECT taxon FROM %s WHERE taxon_level == 'species'""" %
                    estimate_table).fetchall():
                if "_unclassified" in row[0]:
                    continue
                estimate_set.add(row[0])
    finally:
        dbh.close()

    total_estimate = len(estimate_set)
    total_true = len(true_set)
    # check before the output is opened so that a failure does not
    # leave a header-only file behind
    if total_estimate == 0 or total_true == 0:
        raise ValueError(
            "can not calculate rates for %s: "
            "%i estimated and %i true species" %
            (true_file, total_estimate, total_true))

    E.info("counting false positives and false negatives")
    false_positives = estimate_set.difference(true_set)
    E.info("false positives: %s" %
           ", ".join(sorted(map(str, false_positives))))
    nfp = len(false_positives)
    nfn = len(true_set.difference(estimate_set))
    ntp = len(estimate_set.intersection(true_set))

    E.info("writing results")
    track = P.snip(true_base, ".load")
    rates = [
        track,
        float(ntp) / total_estimate,
        float(nfp) / total_estimate,
        float(nfn) / total_true
    ]
    with open(outfile, "w") as outf:
        outf.write("track\ttp_rate\tfp_rate\tfn_rate\n")
        outf.write("\t".join(map(str, rates)) + "\n")
def alignContigsToReference(outfile, param1, param2):
    '''
    align the contigs to the reference genomes
    using nucmer

    outfile: path of the ".delta" file; only its basename is used
        to derive the nucmer prefix and the file that is moved
    param1: reference file, second argument of the nucmer command
    param2: contig file, third argument of the nucmer command

    Two commands are issued in order: nucmer, then a mv of the
    result into alignment.dir.
    '''
    # NOTE(review): not referenced again in this function;
    # presumably read by P.run() from the local scope - confirm
    to_cluster = True

    reffile, contigfile = param1, param2
    # prefix handed to nucmer -p: basename of outfile without ".delta"
    pattern = P.snip(os.path.basename(outfile), ".delta")
    # NOTE(review): P.run() is called without arguments; presumably
    # it picks up `statement` and fills the %(name)s placeholders
    # from the local variables of this function, so the local names
    # must not be changed - confirm against the pipeline library
    statement = '''nucmer -p %(pattern)s %(reffile)s %(contigfile)s'''
    P.run()
    # the result is addressed by basename only, i.e. relative to the
    # directory the command runs in, and moved into alignment.dir
    outf = os.path.basename(outfile)
    statement = '''mv %(outf)s alignment.dir'''
    P.run()
def buildAlignmentSizes(infiles, outfile):
    '''
    use bed files to sum the total number of bases
    that are aligned to the genomes

    infiles: iterable of bed files, opened via iotools.openFile;
        the ".bed.gz" suffix is removed from the basename to name
        the genome
    outfile: tab-separated file with the columns genome and size
        and one row per input file

    Interval lengths (end - start) are summed as given;
    overlapping intervals are not merged.
    '''
    with open(outfile, "w") as outf:
        outf.write("genome\tsize\n")
        for infile in infiles:
            genome = P.snip(os.path.basename(infile), ".bed.gz")
            inf = iotools.openFile(infile)
            try:
                size = sum(bed.end - bed.start
                           for bed in Bed.iterator(inf))
            finally:
                # release each input handle before opening the next
                inf.close()
            outf.write("%s\t%s\n" % (genome, size))
def plotRelativeAbundanceCorrelations(infiles, outfile):
    '''
    plot the correlation between the estimated
    relative abundance of species and the true
    relative abundances - done on the shared set

    infiles: sequence; the first element is the file describing
        the true abundances, the remaining elements are estimate
        files. An estimate is only used if its basename, after
        dropping the first len("metaphlan_") characters, equals
        the basename of the true file. Table names are derived
        from the basenames via P.toTable.
    outfile: png file to plot to; its name without ".png" is
        used as the plot title

    The matched abundances are written to a temporary file, which
    is read back in R and removed again afterwards. The estimates
    are divided by 100 before plotting (presumably they are
    stored as percentages - confirm).
    '''
    true_file = infiles[0]
    true_base = os.path.basename(true_file)
    prefix_length = len("metaphlan_")

    # connect to database
    dbh = sqlite3.connect(PARAMS["database"])
    temp = None
    try:
        cc = dbh.cursor()
        temp = P.getTempFile()
        temp.write("true\testimate\n")
        for estimate_file in infiles[1:]:
            estimate_base = os.path.basename(estimate_file)
            if estimate_base[prefix_length:] != true_base:
                continue
            # get data
            # 'species' is single-quoted: double quotes denote an
            # identifier in SQL and only fall back to a string
            # literal in sqlite
            statement = """SELECT a.relab, b.rel_abundance
                           FROM %s as a, %s as b
                           WHERE b.taxon_level == 'species'
                           AND a.species_name == b.taxon""" % (
                P.toTable(true_base), P.toTable(estimate_base))
            for true, estimate in cc.execute(statement).fetchall():
                temp.write("%f\t%f\n" % (true, estimate))
        # flush to disk before R reads the file
        temp.close()

        inf = temp.name
        R('''data <- read.csv("%s", header = T, stringsAsFactors = F, sep = "\t")'''
          % inf)
        R('''png("%s")''' % outfile)
        main_name = P.snip(outfile, ".png")
        R('''data$estimate <- data$estimate/100''')
        R('''plot(data$estimate, data$true, pch = 16, main = "%s", xlab = "estimated relative abundance", ylab = "observed relative abundance")'''
          % main_name)
        R('''text(0.05, y = 0.35, labels = paste("r = ", round(cor(data$estimate, data$true),2)), cex = 2)'''
          )
        R["dev.off"]()
    finally:
        dbh.close()
        if temp is not None:
            # closing again is harmless if the file was already
            # closed above; the file is removed on failure as well
            temp.close()
            os.unlink(temp.name)