Example #1
    def maskSequences(self, sequences):
        '''mask a collection of sequences.'''

        outfile, infile = tempfile.mkstemp()

        for x, s in enumerate(sequences):
            os.write(outfile, ">%i\n%s\n" % (x, s))

        os.close(outfile)

        statement = self.mCommand % locals()

        E.debug("statement: %s" % statement)

        s = subprocess.Popen(statement,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             close_fds=True)

        (out, err) = s.communicate()

        if s.returncode != 0:
            raise RuntimeError(
                "Error in running %s \n%s\nTemporary directory" %
                (statement, err))

        result = [
            x.sequence for x in FastaIterator.iterate(StringIO.StringIO(out))]

        os.remove(infile)

        return result
Example #2
    def maskSequences(self, sequences):
        '''mask a collection of sequences.'''

        with tempfile.NamedTemporaryFile(mode="w+t", delete=False) as outf:
            for x, s in enumerate(sequences):
                outf.write(">%i\n%s\n" % (x, s))

        infile = outf.name
        statement = self.mCommand % locals()

        E.debug("statement: %s" % statement)

        s = subprocess.Popen(statement,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             close_fds=True)

        (out, err) = s.communicate()

        if s.returncode != 0:
            raise RuntimeError(
                "Error in running %s \n%s\nTemporary directory" %
                (statement, err))

        result = [
            x.sequence for x in FastaIterator.iterate(StringIO(out.decode()))
        ]

        os.remove(infile)

        return result
Example #3
    def maskSequences(self, sequences):
        '''mask a collection of sequences.'''

        outfile, infile = tempfile.mkstemp()

        for x, s in enumerate(sequences):
            os.write(outfile, ">%i\n%s\n" % (x, s))

        os.close(outfile)

        statement = self.mCommand % locals()

        E.debug("statement: %s" % statement)

        s = subprocess.Popen(statement,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             close_fds=True)

        (out, err) = s.communicate()

        if s.returncode != 0:
            raise RuntimeError(
                "Error in running %s \n%s\nTemporary directory" %
                (statement, err))

        result = [x.sequence for x in FastaIterator.iterate(StringIO(out))]

        os.remove(infile)

        return result
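
A minimal, self-contained sketch of the pattern shared by the three
maskSequences() variants above: write the sequences to a temporary FASTA
file, run a shell command over it, and parse the masked sequences back
from stdout. The helper name is ours, and `cat` stands in for the real
masking command (dust, RepeatMasker, ...):

    import os
    import subprocess
    import tempfile

    def mask_with_command(sequences, command_template):
        # write a temporary FASTA file, one numbered record per sequence
        fd, infile = tempfile.mkstemp()
        try:
            with os.fdopen(fd, "w") as outf:
                for i, seq in enumerate(sequences):
                    outf.write(">%i\n%s\n" % (i, seq))
            proc = subprocess.run(command_template % {"infile": infile},
                                  shell=True, capture_output=True, check=True)
            out = proc.stdout.decode("ascii")
            # minimal FASTA parsing: drop '>' headers, rejoin wrapped lines
            records = [chunk.splitlines() for chunk in out.split(">") if chunk]
            return ["".join(lines[1:]) for lines in records]
        finally:
            os.remove(infile)

    # 'cat' echoes the FASTA back unchanged, standing in for a masker
    print(mask_with_command(["ACGT", "NNNN"], "cat %(infile)s"))
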
Example #4
def filterByCoverage(infiles, outfile):
    '''
    filter contigs by mean read coverage, keeping those whose
    average coverage exceeds the coverage_filter threshold
    '''
    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"], PARAMS["database"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"],
            dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(os.path.basename(infile),
                                                ".coverage.load"):
            statement = """SELECT contig_id ave FROM
                           (SELECT contig_id, AVG(coverage) as ave FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename,
                                                PARAMS["coverage_filter"])
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])
    outf = open(outfile, "w")
    print(contigs)
    for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()
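
The coverage query selects contigs whose mean coverage exceeds the
threshold; a quick self-contained check against an in-memory SQLite table
(table name and data made up):

    import sqlite3

    dbh = sqlite3.connect(":memory:")
    cc = dbh.cursor()
    cc.execute("CREATE TABLE cov (contig_id TEXT, coverage REAL)")
    cc.executemany("INSERT INTO cov VALUES (?, ?)",
                   [("c1", 5), ("c1", 7), ("c2", 1), ("c2", 2)])
    statement = """SELECT contig_id FROM
                   (SELECT contig_id, AVG(coverage) AS ave
                    FROM cov GROUP BY contig_id)
                   WHERE ave > %i""" % 3
    print([row[0] for row in cc.execute(statement).fetchall()])  # ['c1']
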
Example #5
def calculateSequenceComposition(interval_names,
                                 sequence_file,
                                 outfile,
                                 header_line=True):
    '''
    given a set of interval names present in a fasta file,
    compute CpG content and write it to outfile
    '''
    interval_file = open(interval_names)
    if header_line:
        interval_file.readline()
    sequence_file = open(sequence_file)

    interval_set = set()
    for line in interval_file.readlines():
        interval_set.add(line[:-1])

    temp = P.getTempFile("/ifs/scratch")
    for record in FastaIterator.iterate(sequence_file):
        seq_id = record.title.split(" ")[0]
        if seq_id in interval_set:
            temp.write(">%s\n%s\n" % (record.title, record.sequence))
    temp.close()

    inf = temp.name
    statement = '''cat %(inf)s | python %(scriptsdir)s/fasta2table.py
                   -s cpg -s length
                   --log=%(outfile)s.log > %(outfile)s'''
    P.run()
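
P.run() in the CGAT pipelines executes the `statement` defined in the
calling function, interpolating %(name)s placeholders from the caller's
local variables and PARAMS. A simplified stand-in (helper name ours,
plain shell execution assumed):

    import subprocess

    def run_statement(statement, **names):
        # interpolate placeholders, then run through the shell
        subprocess.check_call(statement % names, shell=True)

    run_statement("echo %(inf)s > %(outfile)s",
                  inf="tmp.fasta", outfile="out.log")
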
Example #6
def buildInputFiles(infile, outfiles):
    '''
    build input file based on parameters and fasta sequences
    that primers are to be designed for
    '''
    PARAMS["constraints_primer_mispriming_library"] = glob.glob(
        "mispriming.dir/*.lib")[0]

    fasta, identifiers = infile[0], "identifiers.tsv"

    E.info("Reading ids for primer design")
    ids = readIdentifiers(identifiers)

    E.info("collecting sequences")
    for f in FastaIterator.iterate(IOTools.openFile(fasta)):
        if f.title in ids:
            outf = IOTools.openFile(
                os.path.join(
                    "input.dir",
                    f.title.replace(" ", "_").replace("/", "_") +
                    ".input").replace('"', ''), "w")
            seq = f.sequence
            outf.write("SEQUENCE_ID=%s\n" % f.title)
            for key, value in PARAMS.items():
                if "constraints" in key:
                    outf.write(
                        "%s=%s\n" %
                        (key.replace("constraints_", "").upper(), value))
            outf.write("SEQUENCE_TEMPLATE=%s\n=\n" % seq)
            outf.close()
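
The files written above are in primer3's Boulder-IO input format: one
KEY=value pair per line, with a lone '=' terminating each record. An
illustrative record (values made up):

    SEQUENCE_ID=contig_1
    PRIMER_MISPRIMING_LIBRARY=mispriming.dir/repeats.lib
    SEQUENCE_TEMPLATE=ACGTACGTACGT
    =
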
Example #7
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                                   usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-dir", dest="genome_dir", type="string",
                      help="supply help")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    contigs_map = {}
    for genome in glob.glob(os.path.join(options.genome_dir, "*")):
        for fasta in FastaIterator.iterate(IOTools.openFile(genome)):
            identifier = fasta.title.split("|")
            gi = identifier[1]
            contigs_map[gi] = fasta.title

    for line in options.stdin.readlines():
        data = line[:-1].split("\t")
        gi = data[1]
        assert gi in contigs_map, "cannot find genome with id gi|%s in genomes directory" % gi

        options.stdout.write("%s\t%s\n" % (data[0], contigs_map[gi]))

    # write footer and output benchmark information.
    E.Stop()
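
NCBI-style FASTA titles have the form 'gi|12345|ref|NC_000001.1| description',
so splitting the title on '|' puts the GI number at index 1 (values
illustrative):

    title = "gi|12345|ref|NC_000001.1| Example species chromosome 1"
    print(title.split("|")[1])  # '12345'
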
Example #8
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                                   usage=globals()["__doc__"])

    parser.add_option("-b", "--bamfile", dest="bamfile", type="string",
                      help="supply bam file")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # read in contigs
    E.info("reading in contig file")
    contigs = {}
    for fasta in FastaIterator.iterate(options.stdin):
        contigs[fasta.title] = (1, len(fasta.sequence) - 1)
    E.info("read %i contigs" % len(contigs.keys()))

    # read in bamfile
    E.info("reading bam file")
    samfile = pysam.Samfile(options.bamfile)

    E.info("iterating over contigs")
    c = 0
    for contig, coords in contigs.items():
        coords = list(coords)

        #################################
        # NB this is specific for my data!
        contig = contig.split(" ")[0]
        #################################

        species_counts = collections.defaultdict(int)
        for alignment in samfile.fetch(contig, coords[0], coords[1]):
            species_id = alignment.qname.split("|")[1]
            species_counts[species_id] += 1

        # at the moment ignore if there are no counts
        if len(species_counts) == 0:
            E.warn("no reads map to %s" % contig)
            continue

        for species, count in species_counts.items():
            if species_counts[species] == max(species_counts.values()):
                top_dog = species
                c += 1
                break
        E.info("species %s assigned to contig number %i" % (top_dog, c))
        options.stdout.write("%s\t%s\n" % (contig, top_dog))

    # write footer and output benchmark information.
    E.Stop()
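
The top-species loop above scans for a key whose count equals the maximum;
max() with a key function gives the same answer directly (counts made up):

    import collections

    species_counts = collections.defaultdict(int, {"9606": 10, "10090": 3})
    top_dog = max(species_counts, key=species_counts.get)
    print(top_dog)  # '9606'
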
Example #9
def calculateSequenceComposition(interval_names,
                                 sequence_file,
                                 outfile,
                                 header_line=True):
    '''
    given a set of interval names present in a fasta file,
    compute CpG content and write it to outfile
    '''
    interval_file = open(interval_names)
    if header_line:
        interval_file.readline()
    sequence_file = open(sequence_file)

    interval_set = set()
    for line in interval_file.readlines():
        interval_set.add(line[:-1])

    temp = P.getTempFile("/ifs/scratch")
    for record in FastaIterator.iterate(sequence_file):
        seq_id = record.title.split(" ")[0]
        if seq_id in interval_set:
            temp.write(">%s\n%s\n" % (record.title, record.sequence))
    temp.close()

    inf = temp.name
    statement = '''
    cat %(inf)s | cgat fasta2table
    -s na -s cpg -s length
    --log=%(outfile)s.log > %(outfile)s'''

    P.run()
Example #10
def countCompleteGenes(infile, outfile):
    '''
    count the number of genes that are classed
    as complete based on having a start and stop codon
    '''
    start = "ATG"
    stop = ["TAG", "TAA", "TGA"]

    ntotal = 0
    nstart = 0
    nstop = 0
    nstart_nstop = 0
    for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
        ntotal += 1
        has_start = fasta.sequence.startswith(start)
        has_stop = fasta.sequence[-3:] in stop
        if has_start:
            nstart += 1
        if has_stop:
            nstop += 1
        if has_start and has_stop:
            nstart_nstop += 1
    outf = open(outfile, "w")
    outf.write("total_genes\tpstart\tpstop\tpstart_stop\n")
    outf.write("\t".join(
        map(str, [
            ntotal,
            float(nstart) / ntotal,
            float(nstop) / ntotal,
            float(nstart_nstop) / ntotal
        ])) + "\n")
    outf.close()
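
A quick toy check of the start/stop-codon logic used above (sequence made
up):

    seq = "ATGAAATAA"
    start, stop = "ATG", ["TAG", "TAA", "TGA"]
    print(seq.startswith(start), seq[-3:] in stop)  # True True
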
Example #11
def buildInputFiles(infile, outfiles):
    '''
    build input file based on parameters and fasta sequences
    that primers are to be designed for
    '''
    PARAMS["constraints_primer_mispriming_library"] = glob.glob("mispriming.dir/*.lib")[0]

    fasta, identifiers = infile[0], "identifiers.tsv"
    inf = IOTools.openFile(fasta)
    
    E.info("Reading ids for primer design")
    ids = readIdentifiers(identifiers)
    
    E.info("collecting sequences")
    for f in FastaIterator.iterate(IOTools.openFile(fasta)):
        if f.title in ids:
            outf = IOTools.openFile(os.path.join(
                "input.dir",f.title.replace(" ", "_").replace("/","_") + ".input").replace('"', ''), "w")
            seq = f.sequence
            outf.write("SEQUENCE_ID=%s\n" % f.title)
            for key, value in PARAMS.iteritems():
                if "constraints" in key:
                    outf.write("%s=%s\n" % (key.replace("constraints_", "").upper(), value))
            outf.write("SEQUENCE_TEMPLATE=%s\n=\n" % seq)
            outf.close()
Example #12
def filterByCoverage(infiles, outfile):
    '''
    filter contigs by mean read coverage, keeping those whose
    average coverage exceeds the coverage_filter threshold
    '''
    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"], PARAMS["database"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"], dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(os.path.basename(infile), ".coverage.load"):
            statement = """SELECT contig_id ave FROM
                           (SELECT contig_id, AVG(coverage) as ave FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename, PARAMS["coverage_filter"])
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])
    outf = open(outfile, "w")
    print(contigs)
    for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()
Example #13
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser( version = "%prog version: $Id$", 
                             usage = globals()["__doc__"] )

    parser.add_option("-n", dest="N", type="int",
                      help="e.g N50 - the length at which 50% of contigs are equal or above")
    parser.add_option("-f", "--filter-length", dest="filter_length", type="int",
                      help="calculate stats on contigs longer than -f")

    parser.set_defaults(N = 50,
                        filter_length = 0)

    ## add common options (-h/--help, ...) and parse command line 
    (options, args) = E.Start( parser, argv = argv )
    
    f = options.filter_length

    # iterate over the contigs/scaffolds and return stats
    number_of_contigs = 0

    N = options.N
    contig_lengths = []

    for record in FastaIterator.iterate(options.stdin):
        contig_length = len(record.sequence)
        if contig_length >= f:
            number_of_contigs += 1
            contig_lengths.append(contig_length)

    # mean, median and max contig/scaffold lengths
    mean_length = np.mean(contig_lengths)
    median_length = np.median(contig_lengths)
    max_length = max(contig_lengths)

    # iterate over contigs/scaffolds sorted by longest
    # and calculate the NX
    cum_length = 0
    total_length = sum(contig_lengths)
    for index, length in enumerate(sorted(contig_lengths, reverse=True)):
        cum_length += length
        if cum_length >= total_length * (float(N) / 100):
            break

    # output the results
    options.stdout.write("nscaffolds\tscaffold_length\tN%i\tmedian_length\tmean_length\tmax_length\n" % N)
    options.stdout.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (number_of_contigs, total_length, sorted(contig_lengths, reverse = True)[index], str(median_length), str(mean_length), str(max_length)))

    ## write footer and output benchmark information.
    E.Stop()
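
A worked NX example for the loop above: with lengths [8, 5, 4, 2, 1]
(total 20) and N=50, the running sum over the sorted lengths first reaches
50% of the total at 8 + 5 = 13 >= 10, so the N50 is 5:

    lengths = sorted([8, 5, 4, 2, 1], reverse=True)
    total, cum = sum(lengths), 0
    for length in lengths:
        cum += length
        if cum >= total * 0.5:
            break
    print(length)  # 5
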
Example #14
def filterContigs(infile, outfile, length):
    '''
    filter contigs by length
    '''
    outf = open(outfile, "w")
    for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
        seq_length = len(fasta.sequence)
        if seq_length < length: continue
        outf.write(">%s\n%s\n" % (fasta.title, fasta.sequence))
    outf.close()
Example #15
def filterContigs(infile, outfile, length):
    '''
    filter contigs by length
    '''
    outf = open(outfile, "w")
    for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
        seq_length = len(fasta.sequence)
        if seq_length < length: continue
        outf.write(">%s\n%s\n" % (fasta.title, fasta.sequence))
    outf.close()
Example #16
def contig_to_stats(contigs_file, stats_file, params):
    """
    calculate descriptive stats for a set
    of contigs / scaffolds
    """

    PARAMS = params

    if PARAMS["filter"]:
        f = PARAMS["filter"]
    else:
        f = 0

    # iterate over the contigs/scaffolds and return stats
    number_of_scaffolds = 0

    N = PARAMS["scaffold_n"]
    scaffold_lengths = []

    inf = open(contigs_file)
    for record in FastaIterator.iterate(inf):
        scaffold_length = len(record.sequence)
        if scaffold_length >= f:
            number_of_scaffolds += 1
            scaffold_lengths.append(scaffold_length)

    # mean, median and max contig/scaffold lengths
    mean_length = np.mean(scaffold_lengths)
    median_length = np.median(scaffold_lengths)
    max_length = max(scaffold_lengths)

    # iterate over contigs/scaffolds sorted by longest
    # and calculate the NX
    cum_length = 0
    total_length = sum(scaffold_lengths)
    for index, length in enumerate(sorted(scaffold_lengths, reverse=True)):
        cum_length += length
        if cum_length >= total_length * (float(N) / 100):
            break

    # output the results
    outf = open(stats_file, "w")
    outf.write("nscaffolds\tscaffold_length\tN%i\tmedian_length\tmean_length\tmax_length\n" % N)
    outf.write(
        "%s\t%s\t%s\t%s\t%s\t%s\n"
        % (
            number_of_scaffolds,
            total_length,
            sorted(scaffold_lengths, reverse=True)[index],
            str(median_length),
            str(mean_length),
            str(max_length),
        )
    )
Example #17
def build_scaffold_lengths(contigs_file, outfile, params):
    '''
    output the distribution of scaffold lengths
    '''
    inf = open(contigs_file)
    outf = open(outfile, "w")
    outf.write("scaffold_name\tlength\n")
    for record in FastaIterator.iterate(inf):
        scaffold_length = len(record.sequence)
        outf.write("%s\t%i\n" % (record.title, scaffold_length))
    outf.close()
Example #18
def build_scaffold_lengths(contigs_file, outfile, params):
    '''
    output the distribution of scaffold lengths
    '''
    inf = open(contigs_file)
    outf = open(outfile, "w")
    outf.write("scaffold_name\tlength\n")
    for record in FastaIterator.iterate(inf):
        scaffold_length = len(record.sequence)
        outf.write("%s\t%i\n" % (record.title, scaffold_length))
    outf.close()
Example #19
def collectGenomeSizes(infile, outfile):
    '''
    output the genome sizes for each genome
    '''
    to_cluster = True
    outf = open(outfile, "w")
    outf.write("genome\tlength\n")
    # assume single fasta entry
    for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
        name = P.snip(os.path.basename(infile), ".fna")
        length = len(fasta.sequence)
        outf.write("%s\t%s\n" % (name, str(length)))
    outf.close()
Example #20
def collectGenomeSizes(infile, outfile):
    '''
    output the genome sizes for each genome
    '''
    to_cluster = True
    outf = open(outfile, "w")
    outf.write("genome\tlength\n")
    # assume single fasta entry
    for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
        name = P.snip(os.path.basename(infile), ".fna")
        length = len(fasta.sequence)
        outf.write("%s\t%s\n" % (name, str(length)))
    outf.close()
Example #21
def contig_to_stats(contigs_file, stats_file, params):
    '''
    calculate descriptive stats for a set
    of contigs / scaffolds
    '''

    PARAMS = params

    if PARAMS["filter"]:
        f = PARAMS["filter"]
    else:
        f = 0

    # iterate over the contigs/scaffolds and return stats
    number_of_scaffolds = 0

    N = PARAMS["scaffold_n"]
    scaffold_lengths = []

    inf = open(contigs_file)
    for record in FastaIterator.iterate(inf):
        scaffold_length = len(record.sequence)
        if scaffold_length >= f:
            number_of_scaffolds += 1
            scaffold_lengths.append(scaffold_length)

    # mean, median and max contig/scaffold lengths
    mean_length = np.mean(scaffold_lengths)
    median_length = np.median(scaffold_lengths)
    max_length = max(scaffold_lengths)

    # iterate over contigs/scaffolds sorted by longest
    # and calculate the NX
    cum_length = 0
    total_length = sum(scaffold_lengths)
    for index, length in enumerate(sorted(scaffold_lengths, reverse=True)):
        cum_length += length
        if cum_length >= total_length * (float(N) / 100):
            break

    # output the results
    outf = open(stats_file, "w")
    outf.write(
        "nscaffolds\tscaffold_length\tN%i\tmedian_length\tmean_length\tmax_length\n"
        % N)
    outf.write("%s\t%s\t%s\t%s\t%s\t%s\n" %
               (number_of_scaffolds, total_length,
                sorted(scaffold_lengths, reverse=True)[index],
                str(median_length), str(mean_length), str(max_length)))
Example #22
def buildMisprimingLib(infiles, outfile):
    '''
    build fasta file of sequences to check for mispriming
    '''
    fasta, identifiers = infiles

    E.info("reading ids for sequences to keep")
    ids = readIdentifiers(identifiers)

    outf = IOTools.openFile(outfile, "w")
    E.info("collecting sequences")
    for f in FastaIterator.iterate(IOTools.openFile(fasta)):
        if f.title not in ids:
            outf.write(">%s\n%s\n" % (f.title, f.sequence))
    outf.close()
Example #23
def buildMisprimingLib(infiles, outfile):
    '''
    build fasta file of sequences to check for mispriming
    '''
    fasta, identifiers = infiles

    E.info("reading ids for sequences to keep")
    ids = readIdentifiers(identifiers)

    outf = IOTools.openFile(outfile, "w")
    E.info("collecting sequences")
    for f in FastaIterator.iterate(IOTools.openFile(fasta)):
        if f.title not in ids:
            outf.write(">%s\n%s\n" % (f.title, f.sequence))
    outf.close()
Example #24
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-pm",
                      "--profilematrix",
                      dest="matrixfile",
                      type="string",
                      help="name of profile file you want to convert")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)
    # normalise each row of the profile matrix by its total
    for line in IOTools.openFile(options.matrixfile):
        line = line.strip()
        fields = line.split()
        total = sum([float(col) for col in fields[1:]])
        if total == 0:
            continue
        for i, col in enumerate(fields):
            if i == 0:
                continue
            fields[i] = float(col) / total
        options.stdout.write("\t".join(map(str, fields)) + "\n")

    for fasta_read in FastaIterator.iterate(IOTools.openFile(
            options.fastafile)):
        read_sequence = fasta_read.sequence
        read_name = fasta_read.title
        quals = '.' * len(read_sequence)

        new_fastq = Fastq.Record(identifier=read_name,
                                 seq=read_sequence,
                                 quals=quals)
        new_fastq.fromPhred([30] * len(read_sequence), format='illumina-1.8')
        options.stdout.write(str(new_fastq) + "\n")
    # write footer and output benchmark information.
    E.Stop()
Example #25
def build_scaffold_lengths(contigs_file, outfile, params):
    '''
    output the distribution of scaffold lengths
    '''
    PARAMS = params

    if PARAMS["filter"]:
        f = PARAMS["filter"]
    else:
        f = 0
    inf = open(contigs_file)
    outf = open(outfile, "w")
    outf.write("scaffold_name\tlength\n")
    for record in FastaIterator.iterate(inf):
        scaffold_length = len(record.sequence)
        if scaffold_length > f:
            # rename sequences if they have a space in them
            outf.write("%s\t%i\n" % (record.title.replace(" ", "_"), scaffold_length))
    outf.close()
Example #26
def build_scaffold_lengths(contigs_file, outfile, params):
    '''
    output the distribution of scaffold lengths
    '''
    PARAMS = params

    if PARAMS["filter"]:
        f = PARAMS["filter"]
    else:
        f = 0
    inf = open(contigs_file)
    outf = open(outfile, "w")
    outf.write("scaffold_name\tlength\n")
    for record in FastaIterator.iterate(inf):
        scaffold_length = len(record.sequence)
        if scaffold_length > f:
            # rename sequences if they have a space in them
            outf.write("%s\t%i\n" %
                       (record.title.replace(" ", "_"), scaffold_length))
    outf.close()
Example #27
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version=
        "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-dir",
                      dest="genome_dir",
                      type="string",
                      help="supply help")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    contigs_map = {}
    for genome in glob.glob(os.path.join(options.genome_dir, "*")):
        for fasta in FastaIterator.iterate(IOTools.openFile(genome)):
            identifier = fasta.title.split("|")
            gi = identifier[1]
            contigs_map[gi] = fasta.title

    for line in options.stdin.readlines():
        data = line[:-1].split("\t")
        gi = data[1]
        assert gi in contigs_map, "cannot find genome with id gi|%s in genomes directory" % gi

        options.stdout.write("%s\t%s\n" % (data[0], contigs_map[gi]))

    # write footer and output benchmark information.
    E.Stop()
Example #28
def countCompleteGenes(infile, outfile):
    '''
    count the number of genes that are classed
    as complete based on having a start and stop codon
    '''
    start = "ATG"
    stop = ["TAG", "TAA", "TGA"]

    ntotal = 0
    nstart = 0
    nstop = 0
    nstart_nstop = 0
    for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
        ntotal += 1
        has_start = fasta.sequence.startswith(start)
        has_stop = fasta.sequence[-3:] in stop
        if has_start:
            nstart += 1
        if has_stop:
            nstop += 1
        if has_start and has_stop:
            nstart_nstop += 1
    outf = open(outfile, "w")
    outf.write("total_genes\tpstart\tpstop\tpstart_stop\n")
    outf.write("\t".join(map(str,[ntotal, float(nstart)/ntotal, float(nstop)/ntotal, float(nstart_nstop)/ntotal])) + "\n")
    outf.close()
Example #29
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"])

    parser.add_option("-k", "--kmer-size", dest="kmer", type="int", help="supply kmer length")

    parser.add_option(
        "-p",
        "--output-proportion",
        dest="proportion",
        action="store_true",
        help="output proportions - overides the default output",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # do not allow greater than octonucleotide
    assert options.kmer <= 8, "cannot handle kmer of length %i" % options.kmer

    # how we deal with the nucleotides depends on the kmer length
    nucleotides = []
    for nucleotide in ["A", "C", "T", "G"]:
        nucleotides = nucleotides + [x for x in itertools.repeat(nucleotide, options.kmer)]

    E.info("retrieving %imer sequences" % options.kmer)
    # get all kmer sequences to query
    kmers = set()
    for kmer in itertools.permutations(nucleotides, options.kmer):
        kmers.add(kmer)

    E.info("matching %imers in file" % options.kmer)
    # count the number of kmers in each sequence

    result = {}

    # NB assume that non fasta files are caught by FastaIterator
    total_entries = 0
    for fasta in FastaIterator.iterate(options.stdin):
        total_entries += 1
        result[fasta.title] = {}
        for kmer in kmers:
            counts = [m.start() for m in re.finditer("".join(kmer), fasta.sequence)]
            result[fasta.title][kmer] = len(counts)

    E.info("writing results")
    # write out the results
    headers = result.keys()
    rows = set()
    for kmer_counts in result.values():
        for kmer, count in kmer_counts.items():
            rows.add("".join(kmer))

    # write header row
    options.stdout.write("kmer\t" + "\t".join(headers) + "\n")

    # output proportions if required - normalises by
    # sequence length
    E.info("computing total counts")
    totals = {}
    for header in headers:
        totals[header] = sum([result[header][tuple(row)] for row in rows])

    for row in rows:
        if options.proportion:
            options.stdout.write(
                "\t".join([row] + [str(float(result[header][tuple(row)]) / totals[header]) for header in headers])
                + "\n"
            )
        else:
            options.stdout.write("\t".join([row] + [str(result[header][tuple(row)]) for header in headers]) + "\n")

    E.info("written kmer counts for %i contigs" % total_entries)
    # write footer and output benchmark information.
    E.Stop()
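
Building the kmer set by permuting a list that repeats each nucleotide
kmer-length times does cover every kmer, but itertools.product enumerates
all 4**k of them directly (sketch):

    import itertools

    kmers = {"".join(p) for p in itertools.product("ACGT", repeat=2)}
    print(len(kmers))  # 16
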
Example #30
def findTATABox(infiles, outfile):
    '''find TATA box in promotors. There are several matrices to choose from:

    M00216 V$TATA_C Retroviral TATA box
    M00252 V$TATA_01 cellular and viral TATA box elements
    M00311 V$ATATA_B Avian C-type TATA box
    M00320 V$MTATA_B Muscle TATA box
    '''

    # 1. create fasta file - look for TATA box
    #
    bedfile, genomefile = infiles

    statement = '''
    slopBed -i %(bedfile)s
            -l %(tata_search_upstream)i
            -r %(tata_search_downstream)i
            -s
            -g %(genomefile)s
    | cgat bed2fasta 
       --use-strand
       --genome=%(genome_dir)s/%(genome)s
       --log=%(outfile)s.log
    > %(outfile)s.fasta
    '''

    P.run()

    match_executable = '/ifs/data/biobase/transfac/match/bin/match_linux64'
    match_matrix = '/ifs/data/biobase/transfac/dat/matrix.dat'
    # a custom profile (written below) replaces the default minFP_good.prf
    match_profile = outfile + ".prf"

    prf = '''tata.prf
prf to minimize sum of both errors - derived from minSUM.prf
 MIN_LENGTH 300
0.0
 1.000 0.716 0.780 M00216 V$TATA_C
 1.000 0.738 0.856 M00252 V$TATA_01
 1.000 0.717 0.934 M00311 V$ATATA_B
 1.000 0.711 0.784 M00320 V$MTATA_B
//
'''

    with IOTools.openFile(match_profile, "w") as outf:
        outf.write(prf)

    # -u : uniq - only one best match per sequence
    statement = '''
         %(match_executable)s
         %(match_matrix)s
         %(outfile)s.fasta
         %(outfile)s.match
         %(match_profile)s
         -u
    >> %(outfile)s.log
    '''
    P.run()

    transcript2pos = {}
    for entry in FastaIterator.iterate(IOTools.openFile(outfile + ".fasta")):
        transcript_id, contig, start, end, strand = re.match(
            r"(\S+)\s+(\S+):(\d+)\.\.(\d+)\s+\((\S)\)", entry.title).groups()
        transcript2pos[transcript_id] = (contig, int(start), int(end), strand)

    MATCH = collections.namedtuple(
        "MATCH",
        "pid transfac_id pos strand core_similarity matrix_similarity sequence"
    )

    def _grouper(infile):
        r = []
        pid = None
        keep = False
        for line in infile:
            if line.startswith("Inspecting sequence ID"):
                keep = True
                if r:
                    yield pid, r
                r = []
                pid = re.match(r"Inspecting sequence ID\s+(\S+)",
                               line).groups()[0]
                continue
            elif line.startswith(" Total"):
                break

            if not keep:
                continue
            if line[:-1].strip() == "":
                continue
            transfac_id, v, core_similarity, matrix_similarity, sequence = [
                x.strip() for x in line[:-1].split("|")
            ]
            pos, strand = re.match("(\d+) \((\S)\)", v).groups()
            r.append(
                MATCH._make((pid, transfac_id, int(pos), strand,
                             float(core_similarity), float(matrix_similarity),
                             sequence)))

        yield pid, r

    offset = PARAMS["tata_search_upstream"]

    outf = IOTools.openFile(outfile + ".table.gz", "w")
    outf.write("\t".join(("transcript_id", "strand", "start", "end",
                          "relative_start", "relative_end", "transfac_id",
                          "core_similarity", "matrix_similarity",
                          "sequence")) + "\n")

    bedf = IOTools.openFile(outfile, "w")

    c = E.Counter()
    found = set()
    for transcript_id, matches in _grouper(IOTools.openFile(outfile +
                                                            ".match")):
        contig, seq_start, seq_end, strand = transcript2pos[transcript_id]
        c.promotor_with_matches += 1
        nmatches = 0
        found.add(transcript_id)
        for match in matches:

            c.matches_total += 1
            lmatch = len(match.sequence)
            if match.strand == "-":
                c.matches_wrong_strand += 1
                continue

            # get genomic location of match
            if strand == "+":
                genome_start = seq_start + match.pos
            else:
                genome_start = seq_end - match.pos - lmatch

            genome_end = genome_start + lmatch

            # get relative location of match
            if strand == "+":
                tss_start = seq_start + offset
                relative_start = genome_start - tss_start
            else:
                tss_start = seq_end - offset
                relative_start = tss_start - genome_end

            relative_end = relative_start + lmatch

            outf.write("\t".join(
                map(str, (transcript_id, strand, genome_start, genome_end,
                          relative_start, relative_end, match.transfac_id,
                          match.core_similarity, match.matrix_similarity,
                          match.sequence))) + "\n")
            c.matches_output += 1
            nmatches += 1

            bedf.write("\t".join(
                map(str, (contig, genome_start, genome_end, transcript_id,
                          strand, match.matrix_similarity))) + "\n")

        if nmatches == 0:
            c.promotor_filtered += 1
        else:
            c.promotor_output += 1

    c.promotor_total = len(transcript2pos)
    c.promotor_without_matches = len(
        set(transcript2pos.keys()).difference(found))

    outf.close()
    bedf.close()

    with IOTools.openFile(outfile + ".summary", "w") as outf:
        outf.write("category\tcounts\n")
        outf.write(c.asTable() + "\n")

    E.info(c)
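
The title regex in findTATABox expects bed2fasta-style headers such as
'ID chr1:100..200 (+)'; a quick check with illustrative values:

    import re

    m = re.match(r"(\S+)\s+(\S+):(\d+)\.\.(\d+)\s+\((\S)\)",
                 "ENST0001 chr1:100..200 (+)")
    print(m.groups())  # ('ENST0001', 'chr1', '100', '200', '+')
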
Example #31
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--correct-gap-shift",
                      dest="correct_shift",
                      action="store_true",
                      help="correct gap length shifts in alignments. "
                      "Requires alignlib_lite.py [%default]")

    parser.add_option(
        "-1",
        "--pattern1",
        dest="pattern1",
        type="string",
        help="pattern to extract identifier from in identifiers1. "
        "[%default]")

    parser.add_option(
        "-2",
        "--pattern2",
        dest="pattern2",
        type="string",
        help="pattern to extract identifier from in identifiers2. "
        "[%default]")

    parser.add_option("-o",
                      "--output-section",
                      dest="output",
                      type="choice",
                      action="append",
                      choices=("diff", "missed", "seqdiff"),
                      help="what to output [%default]")

    parser.set_defaults(correct_shift=False,
                        pattern1=r"(\S+)",
                        pattern2=r"(\S+)",
                        output=[])

    (options, args) = E.Start(parser)

    if len(args) != 2:
        raise ValueError("two files needed to compare.")

    if options.correct_shift:
        try:
            import alignlib_lite
        except ImportError:
            raise ImportError(
                "option --correct-shift requires alignlib_lite.py_ "
                "but alignlib not found")

    seqs1 = dict([
        (x.title, x.sequence)
        for x in FastaIterator.iterate(IOTools.openFile(args[0], "r"))
    ])
    seqs2 = dict([
        (x.title, x.sequence)
        for x in FastaIterator.iterate(IOTools.openFile(args[1], "r"))
    ])

    if not seqs1:
        raise ValueError("first file %s is empty." % (args[0]))
    if not seqs2:
        raise ValueError("second file %s is empty." % (args[1]))

    MapIdentifiers(seqs1, options.pattern1)
    MapIdentifiers(seqs2, options.pattern2)

    nsame = 0
    nmissed1 = 0
    nmissed2 = 0
    ndiff = 0
    ndiff_first = 0
    ndiff_last = 0
    ndiff_prefix = 0
    ndiff_selenocysteine = 0
    ndiff_masked = 0
    nfixed = 0
    found2 = {}

    write_missed1 = "missed" in options.output
    write_missed2 = "missed" in options.output
    write_seqdiff = "seqdiff" in options.output
    write_diff = "diff" in options.output or write_seqdiff

    for k in seqs1:
        if k not in seqs2:
            nmissed1 += 1
            if write_missed1:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed1"))
            continue

        found2[k] = 1

        s1 = seqs1[k].upper()
        s2 = seqs2[k].upper()
        m = min(len(s1), len(s2))

        if s1 == s2:
            nsame += 1
        else:
            status = "other"

            ndiff += 1

            if s1[1:] == s2[1:]:
                ndiff_first += 1
                status = "first"
            elif s1[:m] == s2[:m]:
                ndiff_prefix += 1
                status = "prefix"
            elif s1[:-1] == s2[:-1]:
                ndiff_last += 1
                status = "last"
            else:
                if len(s1) == len(s2):
                    # get all differences: the first and last residues
                    # can be different for peptide sequences when
                    # comparing my translations with ensembl peptides.
                    differences = []
                    for x in range(1, len(s1) - 1):
                        if s1[x] != s2[x]:
                            differences.append((s1[x], s2[x]))

                    # check for selenocysteines
                    if all(x[0] == "U" or x[1] == "U"
                           for x in differences):
                        ndiff_selenocysteine += 1
                        status = "selenocysteine"

                    # check for masked residues
                    elif all(x[0] in "NX" or x[1] in "NX"
                             for x in differences):
                        ndiff_masked += 1
                        status = "masked"

            # correct for different gap lengths
            if options.correct_shift:

                map_a2b = alignlib_lite.py_makeAlignmentVector()

                a, b = 0, 0
                keep = False

                x = 0
                while x < m and not (a == len(s1) and b == len(s2)):
                    try:
                        if s1[a] != s2[b]:
                            while s1[a] == "N" and s2[b] != "N":
                                a += 1
                            while s1[a] != "N" and s2[b] == "N":
                                b += 1

                            if s1[a] != s2[b]:
                                break
                    except IndexError:
                        print("# index error for %s: x=%i, a=%i, b=%i, "
                              "l1=%i, l2=%i" % (k, x, a, b, len(s1), len(s2)))
                        break

                    a += 1
                    b += 1
                    map_a2b.addPairExplicit(a, b, 0.0)
                    # check if we have reached the end:
                else:
                    keep = True
                    nfixed += 1
                    f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
                    print("fix\t%s\t%s" % (k, str(f)))

                if not keep:
                    print("# warning: not fixable: %s" % k)

            if write_diff:
                options.stdout.write("---- %s ---- %s\n" % (k, status))

            if write_seqdiff:
                options.stdout.write("< %s\n> %s\n" % (seqs1[k], seqs2[k]))

    for k in seqs2.keys():
        if k not in found2:
            nmissed2 += 1
            if write_missed2:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed2"))

    options.stdlog.write("""# Legend:
# seqs1:          number of sequences in set 1
# seqs2:          number of sequences in set 2
# same:           number of identical sequences
# diff:           number of sequences with differences
# nmissed1:       sequences in set 1 that are not found in set 2
# nmissed2:       sequences in set 2 that are not found in set 1
# Type of sequence differences
# first:          only the first residue is different
# last:           only the last residue is different
# prefix:         one sequence is prefix of the other
# selenocysteine: difference due to selenocysteines
# masked:         difference due to masked residues
# fixed:          fixed differences
# other:          other differences
""")

    E.info("seqs1=%i, seqs2=%i, same=%i, ndiff=%i, nmissed1=%i, nmissed2=%i" %
           (len(seqs1), len(seqs2), nsame, ndiff, nmissed1, nmissed2))

    E.info(
        "ndiff=%i: first=%i, last=%i, prefix=%i, selenocysteine=%i, masked=%i, fixed=%i, other=%i"
        % (ndiff, ndiff_first, ndiff_last, ndiff_prefix, ndiff_selenocysteine,
           ndiff_masked, nfixed, ndiff - ndiff_first - ndiff_last -
           ndiff_prefix - ndiff_selenocysteine - ndiff_masked - nfixed))

    E.Stop()
Example #32
    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        method = None,
        parameters = "",
        gop = -10.0,
        gep = -1.0,
        alignment_method = "sw",
        )

    (options, args) = E.Start( parser )

    options.parameters = options.parameters.split(",")    

    iterator = FastaIterator.iterate( sys.stdin )

    if options.method == "add":
        
        mali = Mali.Mali()

        mali.readFromFile( open(options.parameters[0], "r"), format = options.input_format )
        del options.parameters[0]

        old_length = mali.getLength()
        
        new_mali = convertMali2Mali( mali )

        if options.alignment_method == "sw":
            alignator = alignlib.makeAlignatorFullDP( options.gop, options.gep )
        else:
Example #33
def main( argv = None ):

    parser = E.OptionParser( version = "%prog version: $Id: analyze_codonbias_shannon.py 2864 2010-03-03 10:18:16Z andreas $",
                                    usage = globals()["__doc__"] )

    parser.add_option( "-c", "--is-cds", dest="is_cds", action="store_true",
                       help = "input are cds (nucleotide) sequences [%default]" )
    
    parser.set_defaults(
        is_cds = False,
        )
    
    (options, args) = E.Start( parser, argv = argv )

    options.stdout.write( "snpid\tidentifier\tpos\treference\tvariant\tcounts\tweight\n" )

    alphabet = "ACDEFGHIKLMNPQRSTVWY"
    
    snpid = 0

    for entry in FastaIterator.iterate( options.stdin ):
        identifier = entry.title

        if options.is_cds:
            cds_sequence = entry.sequence.upper()
            assert len(cds_sequence) % 3 == 0, \
                "length of sequence '%s' is not a multiple of 3" % entry.title

            sequence = Genomics.translate( cds_sequence )
            weights = []
            for pos, cds_pos in enumerate(range( 0, len(cds_sequence), 3)):
                codon = cds_sequence[cds_pos:cds_pos+3]
                counts = collections.defaultdict(int)
                for x in range(0,3):
                    rna = codon[x]
                    for na in "ACGT":
                        if na == rna: continue
                        taa = Genomics.translate(codon[:x] + na + codon[x+1:])
                        counts[taa] += 1
                weights.append( counts )

        else:
            sequence = entry.sequence.upper()
            counts = {}
            for x in alphabet: counts[x] = 1
            weights = [counts] * len(sequence)

        for pos, ref in enumerate( sequence ):

            if ref not in alphabet: continue
            w = weights[pos]
            t = float(sum(w.values()))
            for variant in alphabet:
                if variant == ref: continue
                snpid +=1
                options.stdout.write( 
                    "%s\n" % "\t".join(
                        ( "%010i" % snpid,
                          identifier,
                          str(pos+1),
                          ref, 
                          variant,
                          "%i" % w[variant],
                          "%6.4f" % (w[variant] / t),
                          )))
    
    E.Stop()
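
In the is_cds branch above, each codon has nine single-base neighbours
(three positions times three alternative bases), so every per-position
counts dict sums to 9. A self-contained check using Biopython's
Seq.translate() as a stand-in for CGAT's Genomics.translate (an
assumption):

    import collections
    from Bio.Seq import Seq  # assumption: Biopython is available

    codon = "TGG"
    counts = collections.defaultdict(int)
    for x in range(3):
        for na in "ACGT":
            if na == codon[x]:
                continue
            taa = str(Seq(codon[:x] + na + codon[x + 1:]).translate())
            counts[taa] += 1
    print(sum(counts.values()))  # 9
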
Example #34
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-k", "--kmer-size", dest="kmer", type="int",
                      help="supply kmer length")

    parser.add_option(
        "-p", "--output-proportion", dest="proportion", action="store_true",
        help="output proportions - overides the default output")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # do not allow greater than octonucleotide
    assert options.kmer <= 8, "cannot handle kmer of length %i" % options.kmer

    # how we deal with the nucleotides depends on the kmer length
    nucleotides = []
    for nucleotide in ["A", "C", "T", "G"]:
        nucleotides = nucleotides + \
            [x for x in itertools.repeat(nucleotide, options.kmer)]

    E.info("retrieving %imer sequences" % options.kmer)
    # get all kmer sequences to query
    kmers = set()
    for kmer in itertools.permutations(nucleotides, options.kmer):
        kmers.add(kmer)

    E.info("matching %imers in file" % options.kmer)
    # count the number of kmers in each sequence

    result = {}

    # NB assume that non fasta files are caught by FastaIterator
    total_entries = 0
    for fasta in FastaIterator.iterate(options.stdin):
        total_entries += 1
        result[fasta.title] = {}
        for kmer in kmers:
            counts = [m.start()
                      for m in re.finditer("".join(kmer), fasta.sequence)]
            result[fasta.title][kmer] = len(counts)

    E.info("writing results")
    # write out the results
    headers = sorted(result.keys())
    rows = set()
    for kmer_counts in list(result.values()):
        for kmer, count in kmer_counts.items():
            rows.add("".join(kmer))

    # write header row
    options.stdout.write("kmer\t" + "\t".join(headers) + "\n")

    # output proportions if required - normalises by
    # sequence length
    E.info("computing total counts")
    totals = {}
    for header in headers:
        totals[header] = sum([result[header][tuple(row)] for row in rows])

    for row in sorted(rows):
        if options.proportion:
            options.stdout.write("\t".join(
                [row] + [str(float(result[header][tuple(row)]) / totals[header]) for header in headers]) + "\n")
        else:
            options.stdout.write(
                "\t".join([row] + [str(result[header][tuple(row)]) for header in headers]) + "\n")

    E.info("written kmer counts for %i contigs" % total_entries)
    # write footer and output benchmark information.
    E.Stop()
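
A detail both kmer examples rely on: kmers are stored as character tuples
while the output rows are the joined strings, and tuple() converts back,
which is why result[header][tuple(row)] finds the right count:

    print(tuple("AC") == ("A", "C"))  # True
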
Example #35
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: run_nubiscan.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-i",
        "--iterations",
        dest="iterations",
        type="int",
        help="number of iterations for sampling [default=%default].")

    parser.add_option("-q",
                      "--qvalue",
                      dest="qvalue_threshold",
                      type="float",
                      help="qvalue threshold [default=%default].")

    parser.add_option("--without-combine",
                      dest="combine",
                      action="store_false",
                      help="combine overlapping motifs [default=%default].")

    parser.add_option("-f",
                      "--fdr-control",
                      dest="fdr_control",
                      type="choice",
                      choices=("per-sequence", "all", "xall"),
                      help="qvalue threshold [default=%default].")

    parser.add_option("-m",
                      "--motif",
                      dest="motif",
                      type="choice",
                      choices=("rxrvdr", "rxrvdr1", "rxrvdr2", "nr"),
                      help="qvalue threshold [default=%default].")

    parser.add_option(
        "-a",
        "--arrangements",
        dest="arrangements",
        type="string",
        help="',' separated list of repeat arrangements [default=%default]")

    parser.add_option("-x",
                      "--mask",
                      dest="mask",
                      type="choice",
                      choices=("dust", "repeatmasker"),
                      help="mask sequences before scanning [default=%default]")

    parser.add_option("--output-stats",
                      dest="output_stats",
                      action="store_true",
                      help="output stats [default=%default].")

    parser.add_option("--add-sequence",
                      dest="add_sequence",
                      action="store_true",
                      help="add sequence information [default=%default].")

    parser.set_defaults(
        iterations=100,
        qvalue_threshold=0.05,
        motif="rxrvdr",
        fdr_control="all",
        combine=True,
        arrangements=None,
        mask=None,
        output_stats=False,
        add_sequence=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    # do sth
    ninput, nskipped, noutput = 0, 0, 0

    if options.arrangements is None:
        options.arrangements = ["DR%s" % x for x in range(0, 15)
                                ] + ["ER%s" % x for x in range(0, 15)]
    else:
        options.arrangements = options.arrangements.split(",")

    options.stdout.write("%s" % "\t".join(Nubiscan.NubiscanMatch._fields))
    if options.add_sequence:
        options.stdout.write("\tsequence")
    options.stdout.write("\n")

    if options.motif == 'nr':
        sense_matrix = NR
    elif options.motif == "rxrvdr":
        sense_matrix = RXRVDR
    elif options.motif == "rxrvdr1":
        sense_matrix = RXRVDR1
    elif options.motif == "rxrvdr2":
        sense_matrix = RXRVDR2
    else:
        raise ValueError("unknown matrix %s" % options.motif)

    if options.fdr_control == "all":

        seqs = list(FastaIterator.iterate(options.stdin))

        if options.mask:
            masked_seqs = maskSequences([x.sequence for x in seqs],
                                        options.mask)
        else:
            masked_seqs = [x.sequence for x in seqs]

        ninput = len(seqs)
        map_id2title = dict(
            enumerate([re.sub(r"\s.*", "", x.title) for x in seqs]))
        matcher = Nubiscan.MatcherRandomisationSequences(
            sense_matrix, samples=options.iterations)

        results = matcher.run(masked_seqs,
                              options.arrangements,
                              qvalue_threshold=options.qvalue_threshold)

        if options.combine:
            results = Nubiscan.combineMotifs(results)

        for r in results:

            if r.alternatives:
                alternatives = ",".join(
                    [x.arrangement for x in r.alternatives])
            else:
                alternatives = ""

            options.stdout.write("\t".join(
                (map_id2title[r.id], "%i" % r.start, "%i" % r.end, r.strand,
                 r.arrangement, "%6.4f" % r.score, "%6.4f" % r.zscore,
                 "%6.4e" % r.pvalue, "%6.4e" % r.qvalue, alternatives)))

            if options.add_sequence:
                s = masked_seqs[int(r.id)][r.start:r.end]
                if r.strand == "-":
                    s = Genomics.complement(s)
                s = s[:6].upper() + s[6:-6].lower() + s[-6:].upper()
                options.stdout.write("\t%s" % s)

            options.stdout.write("\n")
            noutput += 1

        # output stats
        if options.output_stats:
            outfile = E.openOutputFile("fdr")
            outfile.write("bin\thist\tnobserved\n")
            for bin, hist, nobs in zip(matcher.bin_edges, matcher.hist,
                                       matcher.nobservations):
                outfile.write("%f\t%f\t%f\n" % (bin, hist, nobs))
            outfile.close()

    elif options.fdr_control == "xall":

        matcher = Nubiscan.MatcherRandomisationSequence(
            sense_matrix, samples=options.iterations)

        # collect all results
        matches = []
        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            mm = matcher.run(seq.sequence,
                             options.arrangements,
                             qvalue_threshold=None)
            for m in mm:
                matches.append(m._replace(sequence=seq.title))

        # estimate qvalues for all matches across all sequences
        pvalues = [x.pvalue for x in matches]
        fdr = Stats.doFDR(pvalues)
        qvalues = fdr.mQValues
        results = []
        for m, qvalue in zip(matches, qvalues):
            if qvalue > options.qvalue_threshold:
                continue
            results.append(m._replace(qvalue=qvalue))

        if options.combine:
            results = Nubiscan.combineMotifs(results)

        # output
        for r in results:
            options.stdout.write("\t".join(
                (r.id, "%i" % r.start, "%i" % r.end, r.strand, r.arrangement,
                 "%6.4f" % r.score, "%6.4f" % r.zscore, "%6.4e" % r.pvalue,
                 "%6.4e" % r.qvalue)) + "\n")

            noutput += 1

    elif options.fdr_control == "per-sequence":
        matcher = Nubiscan.MatcherRandomisationSequence(
            sense_matrix, samples=options.iterations)

        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            result = matcher.run(seq.sequence,
                                 options.arrangements,
                                 qvalue_threshold=options.qvalue_threshold)

            if options.combine:
                result = Nubiscan.combineMotifs(result)

            t = re.sub(" .*", "", seq.title)
            for r in result:
                options.stdout.write("\t".join(
                    (t, "%i" % r.start, "%i" % r.end, r.strand, r.arrangement,
                     "%6.4f" % r.score, "%6.4f" % r.zscore, "%f" % r.pvalue,
                     "%f" % r.qvalue)) + "\n")

            noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
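In the "xall" branch above, q-values come from pooling the raw p-values of every match and running them through Stats.doFDR. As a rough sketch of what such a computation does (an assumption about doFDR's behaviour, not its actual code), a standalone Benjamini-Hochberg q-value function looks like this:

import numpy as np

def bh_qvalues(pvalues):
    # Benjamini-Hochberg q-values; hypothetical stand-in for Stats.doFDR
    p = np.asarray(pvalues, dtype=float)
    n = len(p)
    order = np.argsort(p)
    ranked = p[order] * n / np.arange(1, n + 1)
    # q_i = min over j >= i of p_(j) * n / j, capped at 1
    q = np.minimum.accumulate(ranked[::-1])[::-1]
    out = np.empty(n)
    out[order] = np.minimum(q, 1.0)
    return out

# as in the loop above, matches would then be kept only if their
# q-value is at or below options.qvalue_threshold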
Example #37
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--input-fasta",
                      dest="fasta",
                      type="str",
                      help="name of fasta infile")

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("transcript", "gene"),
                      help="count unique kmers per transcript or gene")

    parser.add_option("--genemap",
                      dest="genemap",
                      type="str",
                      help="file mapping transcripts to genes")

    parser.add_option("-k",
                      "--kmer-size",
                      dest="kmer",
                      type="int",
                      help="supply kmer length")

    parser.add_option("--subset",
                      dest="subset",
                      type="int",
                      help="only analyse the first x entries")

    parser.set_defaults(fasta=None,
                        method="transcript",
                        genemap=None,
                        kmer=10,
                        subset=None)

    (options, args) = E.Start(parser)

    E.info("%s\n" % using("start"))

    assert options.fasta, "must provide a fasta filename (--input-fasta=)"

    k = KmerCounter()

    Iterator = FastaIterator.iterate(IOTools.openFile(options.fasta))

    # total entries also acts as the index for the entry_id
    total_entries = 0

    options.stdout.write("%s\n" % "\t".join(
        ("id", "unique_kmers", "non_unique_kmers", "fraction_unique")))

    # iterate fasta entries, shred and identify kmers

    if options.method == 'gene':
        E.info("shredding genes to identify unique kmers")

        assert options.genemap, (
            "to perform a gene-level unique kmer count, "
            "you must supply a transcript2gene map (--genemap)")
        t2g = {}
        with IOTools.openFile(options.genemap, "r") as inf:
            for line in inf:
                transcript, gene = line.strip().split("\t")
                t2g[transcript] = gene

        genes = set()
        current_gene = None
        sequences = []

        for entry in Iterator:

            if options.subset and total_entries >= options.subset:
                break

            transcript_id = entry.title.split()[0]
            gene_id = t2g[transcript_id]

            if gene_id != current_gene:
                # first gene: remember it and seed the sequence buffer
                if not current_gene:
                    current_gene = gene_id
                    sequences = [entry.sequence.upper()]
                    continue

                # check that this is the first time we've seen this gene
                assert current_gene not in genes, (
                    "the fasta does not appear to be sorted in gene order, the"
                    " same gene is observed in non-consecutive positions!")

                genes.add(current_gene)

                k.shred(sequences, options.kmer)

                if total_entries % 1000 == 0:
                    E.info("1st shred complete for %i genes" % total_entries)

                total_entries += 1

                sequences = [entry.sequence.upper()]
                current_gene = gene_id

            else:
                sequences.append(entry.sequence.upper())

        # catch last gene
        if not options.subset or total_entries < options.subset:
            k.shred(sequences, options.kmer)

        E.info("1st shred complete for %i genes" % total_entries)

    elif options.method == 'transcript':
        E.info("shredding transcripts to identify unique kmers")
        for entry in Iterator:
            if total_entries % 1000 == 0:
                E.info("1st shred complete for %i transcripts" % total_entries)

            if options.subset and total_entries >= options.subset:
                break

            k.shred([entry.sequence.upper()], options.kmer)
            total_entries += 1
        E.info("1st shred complete for %i transcripts" % total_entries)

    total_entries = 0
    Iterator = FastaIterator.iterate(IOTools.openFile(options.fasta))

    # iterate fasta entries, shred and count unique kmers
    if options.method == 'gene':
        E.info("re-shredding fasta to count gene unique kmers")

        genes = set()
        current_gene = None
        sequences = []

        for entry in Iterator:

            if options.subset and total_entries >= options.subset:
                break

            transcript_id = entry.title.split()[0]
            gene_id = t2g[transcript_id]

            if gene_id != current_gene:
                # first gene: remember it and seed the sequence buffer
                if not current_gene:
                    current_gene = gene_id
                    sequences = [entry.sequence.upper()]
                    continue

                # check that this is the first time we've seen this gene
                assert current_gene not in genes, (
                    "the fasta does not appear to be sorted in gene order, the"
                    " same gene is observed in non-consecutive positions!")
                genes.add(current_gene)

                unique, non_unique = k.countUniqueKmers(
                    sequences, options.kmer)

                fraction = np.divide(float(unique), (unique + non_unique))

                options.stdout.write("%s\n" % "\t".join(
                    map(str, (current_gene, unique, non_unique, fraction))))

                if total_entries % 1000 == 0:
                    E.info("2nd shred complete for %i genes" % total_entries)

                total_entries += 1

                sequences = [entry.sequence.upper()]
                current_gene = gene_id

            else:
                sequences.append(entry.sequence.upper())

        # catch last gene
        if not options.subset or total_entries < options.subset:
            unique, non_unique = k.countUniqueKmers(sequences, options.kmer)

            fraction = np.divide(float(unique), (unique + non_unique))

            options.stdout.write(
                "%s\n" %
                "\t".join(map(str, (gene_id, unique, non_unique, fraction))))

    if options.method == 'transcript':
        E.info("re-shredding fasta to count transcript unique kmers")
        for entry in Iterator:

            if total_entries % 1000 == 0:
                E.info("2nd shred complete for %i transcripts" % total_entries)

            if options.subset and total_entries >= options.subset:
                break

            transcript_id = entry.title.split()[0]

            total_entries += 1

            unique, non_unique = k.countUniqueKmers([entry.sequence.upper()],
                                                    options.kmer)

            fraction = np.divide(float(unique), (unique + non_unique))

            options.stdout.write("%s\n" % "\t".join(
                map(str, (transcript_id, unique, non_unique, fraction))))

    E.info("found %i kmers" % len(k.kmer2entry))
    E.info("written kmer counts for %i contigs" % total_entries)
    # write footer and output benchmark information.
    E.info("%s\n" % using("end"))

    E.Stop()
Example #38
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--input-fasta", dest="fasta", type="str",
                      help="name of fasta infile")

    parser.add_option("-k", "--kmer-size", dest="kmer", type="int",
                      help="supply kmer length")

    parser.add_option("--subset", dest="subset", type="int",
                      help="only analyse the first x entries")

    parser.set_defaults(
        fasta=None,
        kmer=10,
        subset=None)

    (options, args) = E.Start(parser)

    E.info("%s\n" % using("start"))

    assert options.fasta, "must provide a fasta filename (--input-fasta=)"

    k = KmerCounter()

    Iterator = FastaIterator.iterate(IOTools.openFile(options.fasta))

    # total entries also acts as the index for the entry_id
    total_entries = 0

    options.stdout.write("%s\n" % "\t".join((
        "id", "unique_kmers", "non_unique_kmers", "fraction_unique")))

    # iterate transcripts, shred and identify unique kmers
    E.info("shredding fasta to identify unique kmers")
    for entry in Iterator:
        if total_entries % 1000 == 0:
            E.info("1st shred complete for %i entries" % total_entries)

        if options.subset and total_entries >= options.subset:
            break

        k.shred(entry.sequence.upper(), options.kmer)
        total_entries += 1

    total_entries = 0
    Iterator = FastaIterator.iterate(IOTools.openFile(options.fasta))

    # iterate transcripts, shred and count unique kmers
    E.info("re-shredding fasta to count unique kmers")

    for entry in Iterator:
        if total_entries % 1000 == 0:
            E.info("2nd shred complete for %i entries" % total_entries)

        transcript_id = entry.title.split()[0]

        if options.subset and total_entries >= options.subset:
            break

        total_entries += 1

        unique, non_unique = k.countUniqueKmers(
            entry.sequence.upper(), options.kmer)

        fraction = np.divide(float(unique), (unique + non_unique))

        options.stdout.write("%s\n" % "\t".join(
            map(str, (transcript_id, unique, non_unique, fraction))))

    E.info("found %i kmers" % len(k.kmer2entry))
    E.info("written kmer counts for %i contigs" % total_entries)
    # write footer and output benchmark information.
    E.info("%s\n" % using("end"))

    E.Stop()
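KmerCounter itself is not shown in these examples. A minimal, hypothetical sketch of the interface the two-pass scheme assumes (not the actual class): shred() records which entry each kmer was seen in, and countUniqueKmers() then reports how many of an entry's kmers were seen in no other entry.

class KmerCounter(object):
    # hypothetical sketch of the interface used above, not the real class

    def __init__(self):
        # kmer -> id of the single entry it occurred in, or "multi"
        self.kmer2entry = {}
        self._entry_id = 0

    def shred(self, sequence, k):
        self._entry_id += 1
        for i in range(len(sequence) - k + 1):
            kmer = sequence[i:i + k]
            seen = self.kmer2entry.get(kmer)
            if seen is None:
                self.kmer2entry[kmer] = self._entry_id
            elif seen != self._entry_id:
                self.kmer2entry[kmer] = "multi"

    def countUniqueKmers(self, sequence, k):
        unique = non_unique = 0
        for i in range(len(sequence) - k + 1):
            if self.kmer2entry.get(sequence[i:i + k]) == "multi":
                non_unique += 1
            else:
                unique += 1
        return unique, non_unique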
Example #39
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: split_fasta.py 1714 2007-12-11 16:51:12Z andreas $")

    parser.add_option("-f", "--file", dest="input_filename", type="string",
                      help="input filename. If not given, stdin is used.",
                      metavar="FILE")

    parser.add_option("-i", "--input-pattern", dest="input_pattern", type="string",
                      help="input pattern. Parses description line in order to extract id.")

    parser.add_option("-o", "--output-pattern", dest="output_pattern", type="string",
                      help="output pattern. Gives filename for a given sequence.")

    parser.add_option("-n", "--num-sequences", dest="num_sequences", type="int",
                      help="split by number of sequences (not implemented yet).")

    parser.add_option("-m", "--map", dest="map_filename", type="string",
                      help="map filename. Map identifiers to filenames",
                      metavar="FILE")

    parser.add_option("-s", "--skip-identifiers", dest="skip_identifiers", action="store_true",
                      help="do not write identifiers.",
                      metavar="FILE")

    parser.add_option("--min-size", dest="min_size", type="int",
                      help="minimum cluster size.")

    parser.set_defaults(
        input_filename=None,
        map_filename=None,
        skip_identifiers=False,
        input_pattern="^(\S+)",
        min_size=0,
        num_sequences=None,
        output_pattern="%s")

    (options, args) = E.Start(parser)

    if options.input_filename:
        infile = IOTools.openFile(options.input_filename, "r")
    else:
        infile = sys.stdin

    if options.map_filename:
        map_id2filename = IOTools.ReadMap(open(options.map_filename, "r"))
    else:
        map_id2filename = {}

    if options.num_sequences:
        files = FilesChunks(chunk_size=options.num_sequences,
                            output_pattern=options.output_pattern,
                            skip_identifiers=options.skip_identifiers)

    else:
        files = Files(output_pattern=options.output_pattern,
                      skip_identifiers=options.skip_identifiers)

    if options.input_pattern:
        rx = re.compile(options.input_pattern)
    else:
        rx = None

    ninput = 0
    noutput = 0
    identifier = None
    chunk = 0

    for seq in FastaIterator.iterate(infile):

        ninput += 1

        if rx:
            try:
                identifier = rx.search(seq.title).groups()[0]
            except AttributeError:
                print "# parsing error in description line %s" % (seq.title)
        else:
            identifier = seq.title

        if map_id2filename:
            if identifier in map_id2filename:
                identifier = map_id2filename[identifier]
            else:
                continue

        files.Write(identifier, seq)
        noutput += 1

    if options.input_filename:
        infile.close()

    # delete all clusters below a minimum size
    # Note: this has to be done at the end, because
    # clusters sizes are only available once both the fasta
    # file and the map has been parsed.
    if options.min_size:
        ndeleted = files.DeleteFiles(min_size=options.min_size)
    else:
        ndeleted = 0

    if options.loglevel >= 1:
        print "# input=%i, output=%i, ndeleted=%i" % (ninput, noutput, ndeleted)

    E.Stop()
Example #40
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version=
        "%prog version: $Id: contigs2random_sample.py 2871 2010-03-03 10:20:44Z nicki $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-m",
        "--species-map",
        dest="species_map",
        type="string",
        help="text file specifying the mapping between contig and genome")

    parser.add_option(
        "-g",
        "--genome-dir",
        dest="genome_dir",
        type="string",
        help="specify directory where genome / genomes are stored")

    ## add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # read in contig lengths into dictionary
    E.info("reading contigs file")
    c_contigs = 0
    contigs_lengths = {}
    for fasta in FastaIterator.iterate(options.stdin):
        c_contigs += 1

        # titles of fasta records must be single strings with no special characters
        contigs_lengths[fasta.title.split(" ")[0]] = len(fasta.sequence)

    E.info("read %i contigs" % c_contigs)

    # read in mapping between species and contigs
    species_map = {}
    for line in open(options.species_map).readlines():
        data = line[:-1].split("\t")
        contig, species = data[0], data[1]
        species_map[contig] = species

    # read genomes into memory
    # NB this may need optimising if using large
    # genomes or many genomes
    E.info("reading genomes from %s" % options.genome_dir)

    # The directory must ONLY contain genome files!!
    genomes_sequences = {}
    c_genomes = 0
    for genome_file in glob.glob(os.path.join(options.genome_dir, "*")):
        c_genomes += 1
        for fasta in FastaIterator.iterate(IOTools.openFile(genome_file)):
            genomes_sequences[fasta.title] = fasta.sequence
    E.info("read %i genomes from %s" % (c_genomes, options.genome_dir))

    # iterate over the contigs and sample from the respective genome
    E.info("iterating over contigs")
    c_contigs_output = 0
    for contig, length in contigs_lengths.items():
        if contig not in species_map:
            E.warn("contig %s not in species map file" % contig)
        else:
            c_contigs_output += 1
            genome = species_map[contig]
            genome_length = len(genomes_sequences[genome])

            # sample a start position such that the sampled contig
            # fits within the genome (the original try/except could
            # never trigger, and the slice could run past the end)
            start = random.randint(0, max(0, genome_length - length))
            end = start + length

            sampled_seq = genomes_sequences[genome][start:end]
            options.stdout.write(
                ">%s_random\n%s\n" %
                (contig + "_%s" % species_map[contig], sampled_seq))

    E.info("written %i contigs" % c_contigs_output)
    ## write footer and output benchmark information.
    E.Stop()
def runMEMEOnSequences(infile, outfile, background=None, psp=None):
    '''run MEME on fasta sequences to find motifs.

    By default MEME calculates a zero-th order background
    model from the nucleotide frequencies in the input set.

    To use a different background model, a background
    file created by fasta-get-markov must be supplied.

    To perform discriminative analysis a position specific
    prior (psp) file must be provided. This can be generated
    using generatePSP.
    '''
    # job_options = "-l mem_free=8000M"

    nseqs = int(FastaIterator.count(infile))
    if nseqs < 2:
        E.warn("%s: less than 2 sequences - meme skipped" % outfile)
        P.touch(outfile)
        return

    # Get the total length of the sequences to decide the memory
    total_seqs_length = 0

    with IOTools.open_file(infile, "r") as fasta_reader:

        iterator_fasta = FastaIterator.iterate(fasta_reader)

        for fasta_seq in iterator_fasta:
            total_seqs_length += len(fasta_seq.sequence)

    # If the length of all sequences is higher than 160,000bp
    # Up the memory
    job_memory = "2G"

    if (total_seqs_length > 160000):
        job_memory = "4G"

    if PARAMS.get("meme_revcomp", True):
        revcomp = "-revcomp"
    else:
        revcomp = ""

    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]), outfile)
    tmpdir = P.get_temp_dir(".")
    if background:
        background_model = "-bfile %s" % background
    else:
        background_model = ""

    if psp:
        E.info("Running MEME in descriminative mode")
        psp_file = "-psp %s" % psp
    else:
        psp_file = ""

    statement = '''
    meme %(infile)s -dna %(revcomp)s
    -p %(meme_threads)s
    -mod %(meme_model)s
    -nmotifs %(meme_nmotifs)s
    -oc %(tmpdir)s
    -maxsize %(meme_max_size)s
    %(background_model)s
    %(psp_file)s
    %(meme_options)s
       2> %(outfile)s.log
    '''

    # If running with more than one thread
    # http://git.net/ml/clustering.gridengine.users/2007-04/msg00058.html
    # specify "excl=false -w n -pe openmpi-ib num_threads" in cluster_options
    # through job_options
    if int(PARAMS["meme_threads"]) != 1:
        job_options = str(PARAMS["meme_job_options"])
        job_threads = int(PARAMS["meme_threads"])
        cluster_parallel_environment = str(
            PARAMS["meme_cluster_parallel_environment"])

    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile)
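A hypothetical invocation of the function above in discriminative mode; the file names are illustrative, not pipeline outputs:

runMEMEOnSequences(
    "peaks.fasta.gz",
    "meme.dir/peaks.meme",
    background="promotors.background",  # built with fasta-get-markov
    psp="peaks.psp")                    # built with generatePSP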
Example #43
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version=
        "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-b",
                      "--bam-file",
                      dest="bamfile",
                      type="string",
                      help="supply bam file")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # read in contigs
    E.info("reading in contig file")
    contigs = {}
    for fasta in FastaIterator.iterate(options.stdin):
        contigs[fasta.title] = (1, len(fasta.sequence) - 1)
    E.info("read %i contigs" % len(contigs.keys()))

    # read in bamfile
    E.info("reading bam file")
    samfile = pysam.Samfile(options.bamfile)

    E.info("iterating over contigs")
    c = 0
    for contig, coords in contigs.items():
        coords = list(coords)

        #################################
        # NB this is specific for my data!
        contig = contig.split(" ")[0]
        #################################

        species_counts = collections.defaultdict(int)
        for alignment in samfile.fetch(contig, coords[0], coords[1]):
            species_id = alignment.qname.split("|")[1]
            species_counts[species_id] += 1

        # at the moment ignore if there are no counts
        if len(species_counts.values()) == 0:
            E.warn("no reads map to %s" % contig)
            continue

        for species, count in species_counts.items():
            if count == max(species_counts.values()):
                top_dog = species
                c += 1
                break
        E.info("species %s assigned to contig number %i" % (top_dog, c))
        options.stdout.write("%s\t%s\n" % (contig, top_dog))

    # write footer and output benchmark information.
    E.Stop()
Example #44
0
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-s", "--correct-gap-shift", dest="correct_shift",
        action="store_true",
        help="correct gap length shifts in alignments. "
        "Requires alignlib_lite.py [%default]")

    parser.add_option(
        "-1", "--pattern1", dest="pattern1", type="string",
        help="pattern to extract identifier from in identifiers1. "
        "[%default]")

    parser.add_option(
        "-2", "--pattern2", dest="pattern2", type="string",
        help="pattern to extract identifier from in identifiers2. "
        "[%default]")

    parser.add_option(
        "-o", "--output-section", dest="output", type="choice",
        action="append",
        choices=("diff", "missed", "seqdiff"),
        help="what to output [%default]")

    parser.set_defaults(correct_shift=False,
                        pattern1="(\S+)",
                        pattern2="(\S+)",
                        output=[])

    (options, args) = E.Start(parser)

    if len(args) != 2:
        raise ValueError("two files needed to compare.")

    if options.correct_shift:
        try:
            import alignlib_lite
        except ImportError:
            raise ImportError(
                "option --correct-shift requires alignlib_lite.py_ "
                "but alignlib not found")

    seqs1 = dict([
        (x.title, x.sequence) for x in FastaIterator.iterate(
            IOTools.openFile(args[0], "r"))])
    seqs2 = dict([
        (x.title, x.sequence) for x in FastaIterator.iterate(
            IOTools.openFile(args[1], "r"))])

    if not seqs1:
        raise ValueError("first file %s is empty." % (args[0]))
    if not seqs2:
        raise ValueError("second file %s is empty." % (args[1]))

    MapIdentifiers(seqs1, options.pattern1)
    MapIdentifiers(seqs2, options.pattern2)

    nsame = 0
    nmissed1 = 0
    nmissed2 = 0
    ndiff = 0
    ndiff_first = 0
    ndiff_last = 0
    ndiff_prefix = 0
    ndiff_selenocysteine = 0
    ndiff_masked = 0
    nfixed = 0
    found2 = {}

    write_missed1 = "missed" in options.output
    write_missed2 = "missed" in options.output
    write_seqdiff = "seqdiff" in options.output
    write_diff = "diff" in options.output or write_seqdiff

    for k in sorted(seqs1):
        if k not in seqs2:
            nmissed1 += 1
            if write_missed1:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed1"))
            continue

        found2[k] = 1

        s1 = seqs1[k].upper()
        s2 = seqs2[k].upper()
        m = min(len(s1), len(s2))

        if s1 == s2:
            nsame += 1
        else:
            status = "other"

            ndiff += 1

            if s1[1:] == s2[1:]:
                ndiff_first += 1
                status = "first"
            elif s1[:m] == s2[:m]:
                ndiff_prefix += 1
                status = "prefix"
            elif s1[:-1] == s2[:-1]:
                ndiff_last += 1
                status = "last"
            else:
                if len(s1) == len(s2):
                    # get all differences: the first and last residues
                    # can be different for peptide sequences when
                    # comparing my translations with ensembl peptides.
                    differences = []
                    for x in range(1, len(s1) - 1):
                        if s1[x] != s2[x]:
                            differences.append((s1[x], s2[x]))

                    l = len(differences)
                    # check for Selenocysteins
                    if len([x for x in differences if x[0] == "U" or x[1] == "U"]) == l:
                        ndiff_selenocysteine += 1
                        status = "selenocysteine"

                    # check for masked residues
                    elif len([x for x in differences if x[0] in "NX" or x[1] in "NX"]) == l:
                        ndiff_masked += 1
                        status = "masked"

            # correct for different gap lengths
            if options.correct_shift:

                map_a2b = alignlib_lite.py_makeAlignmentVector()

                a, b = 0, 0
                keep = False

                x = 0
                while x < m and not (a == len(s1) and b == len(s2)):
                    try:
                        if s1[a] != s2[b]:
                            while s1[a] == "N" and s2[b] != "N":
                                a += 1
                            while s1[a] != "N" and s2[b] == "N":
                                b += 1

                            if s1[a] != s2[b]:
                                break
                    except IndexError:
                        print("# index error for %s: x=%i, a=%i, b=%i, l1=%i, l2=%i" % (k, x, a, b, len(s1), len(s2)))
                        break

                    a += 1
                    b += 1
                    map_a2b.addPairExplicit(a, b, 0.0)
                    # check if we have reached the end:
                else:
                    keep = True
                    nfixed += 1
                    f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
                    print("fix\t%s\t%s" % (k, str(f)))

                if not keep:
                    print("# warning: not fixable: %s" % k)

            if write_diff:
                options.stdout.write("---- %s ---- %s\n" % (k, status))

            if write_seqdiff:
                options.stdout.write("< %s\n> %s\n" % (seqs1[k], seqs2[k]))

    for k in sorted(list(seqs2.keys())):
        if k not in found2:
            nmissed2 += 1
            if write_missed2:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed2"))

    options.stdlog.write("""# Legend:
""")

    E.info("seqs1=%i, seqs2=%i, same=%i, ndiff=%i, nmissed1=%i, nmissed2=%i" %
           (len(seqs1), len(seqs2), nsame, ndiff, nmissed1, nmissed2))

    E.info(
        "ndiff=%i: first=%i, last=%i, prefix=%i, selenocysteine=%i, masked=%i, fixed=%i, other=%i" %
        (ndiff, ndiff_first, ndiff_last, ndiff_prefix,
         ndiff_selenocysteine, ndiff_masked, nfixed,
         ndiff - ndiff_first - ndiff_last - ndiff_prefix -
         ndiff_selenocysteine - ndiff_masked - nfixed))

    E.Stop()
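The status classification above can be distilled into a small standalone helper (a re-statement for clarity, not part of the original script):

def classify_diff(s1, s2):
    # order of checks mirrors the script: first, then prefix, then last
    m = min(len(s1), len(s2))
    if s1 == s2:
        return "same"
    if s1[1:] == s2[1:]:
        return "first"   # only the first residue differs
    if s1[:m] == s2[:m]:
        return "prefix"  # one sequence is a prefix of the other
    if s1[:-1] == s2[:-1]:
        return "last"    # only the last residue differs
    return "other"

assert classify_diff("MABC", "XABC") == "first"
assert classify_diff("MABC", "MABCDE") == "prefix"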
Example #46
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: sequences2mali.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("plain", "fasta", "clustal", "stockholm",
                               "phylip"),
                      help="input format of multiple alignment")

    parser.add_option("-o",
                      "--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("plain", "fasta", "stockholm", "phylip"),
                      help="output format of multiple alignment")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("add", ),
                      help="""method to use to build multiple alignment.""")

    parser.add_option("-p",
                      "--parameters",
                      dest="parameters",
                      type="string",
                      help="parameter stack for methods that require one.")

    parser.add_option("-a",
                      "--alignment-method",
                      dest="alignment_method",
                      type="choice",
                      choices=("sw", "nw"),
                      help="alignment_method [%default].")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        method=None,
        parameters="",
        gop=-10.0,
        gep=-1.0,
        alignment_method="sw",
    )

    (options, args) = E.Start(parser)

    options.parameters = options.parameters.split(",")

    iterator = FastaIterator.iterate(sys.stdin)

    if options.method == "add":

        mali = Mali.Mali()

        mali.readFromFile(open(options.parameters[0], "r"),
                          format=options.input_format)
        del options.parameters[0]

        old_length = mali.getLength()

        new_mali = convertMali2Mali(mali)

        if options.alignment_method == "sw":
            alignator = alignlib_lite.py_makeAlignatorFullDP(
                options.gop, options.gep)
        else:
            alignator = alignlib_lite.py_makeAlignatorFullDPGlobal(
                options.gop, options.gep)

        while 1:
            cur_record = next(iterator, None)
            if cur_record is None:
                break

            map_mali2seq = alignlib_lite.py_makeAlignataVector()

            sequence = alignlib_lite.py_makeSequence(cur_record.sequence)
            profile = alignlib_lite.py_makeProfileFromMali(new_mali)

            if options.loglevel >= 4:
                options.stdlog.write(profile.Write())

            alignator.Align(profile, sequence, map_mali2seq)

            if options.loglevel >= 3:
                options.stdlog.write(map_mali2seq.Write())

            ## add sequence to mali
            a = alignlib_lite.py_makeAlignatumFromString(cur_record.sequence)
            a.thisown = 0

            new_mali.addAlignatum(a, map_mali2seq, 1, 1, 1, 1, 1)

            id = cur_record.title
            mali.mIdentifiers.append(id)
            mali.mMali[id] = Mali.AlignedString(
                id, 0, len(cur_record.sequence),
                new_mali.getRow(new_mali.getWidth() - 1).getString())

        # substitute
        for x in range(old_length):
            mali.mMali[mali.mIdentifiers[x]].mString = new_mali.getRow(
                x).getString()

        mali.writeToFile(sys.stdout, format=options.output_format)

    E.Stop()
def findTATABox(infiles, outfile):
    '''find TATA box in promotors. There are several matrices to choose from:

    M00216 V$TATA_C Retroviral TATA box
    M00252 V$TATA_01 cellular and viral TATA box elements
    M00311 V$ATATA_B Avian C-type TATA box
    M00320 V$MTATA_B Muscle TATA box
    '''

    # 1. create fasta file in which to look for TATA boxes
    bedfile, genomefile = infiles

    statement = '''
    slopBed -i %(bedfile)s
            -l %(tata_search_upstream)i
            -r %(tata_search_downstream)i
            -s
            -g %(genomefile)s
    | cgat bed2fasta 
       --use-strand
       --genome=%(genome_dir)s/%(genome)s
       --log=%(outfile)s.log
    > %(outfile)s.fasta
    '''

    P.run()

    match_executable = '/ifs/data/biobase/transfac/match/bin/match_linux64'
    match_matrix = '/ifs/data/biobase/transfac/dat/matrix.dat'
    match_profile = outfile + ".prf"

    prf = '''tata.prf
prf to minimize sum of both errors - derived from minSUM.prf
 MIN_LENGTH 300
0.0
 1.000 0.716 0.780 M00216 V$TATA_C
 1.000 0.738 0.856 M00252 V$TATA_01
 1.000 0.717 0.934 M00311 V$ATATA_B
 1.000 0.711 0.784 M00320 V$MTATA_B
//
'''

    with IOTools.openFile(match_profile, "w") as outf:
        outf.write(prf)

    # -u : uniq - only one best match per sequence
    statement = '''
         %(match_executable)s
         %(match_matrix)s
         %(outfile)s.fasta
         %(outfile)s.match
         %(match_profile)s
         -u
    >> %(outfile)s.log
    '''
    P.run()

    transcript2pos = {}
    for entry in FastaIterator.iterate(IOTools.openFile(outfile + ".fasta")):
        transcript_id, contig, start, end, strand = re.match(
            "(\S+)\s+(\S+):(\d+)..(\d+)\s+\((\S)\)", entry.title).groups()
        transcript2pos[transcript_id] = (contig, int(start), int(end), strand)

    MATCH = collections.namedtuple(
        "MATCH", "pid transfac_id pos strand core_similarity matrix_similarity sequence")

    def _grouper(infile):
        r = []
        keep = False
        for line in infile:
            if line.startswith("Inspecting sequence ID"):
                keep = True
                if r:
                    yield pid, r
                r = []
                pid = re.match(
                    "Inspecting sequence ID\s+(\S+)", line).groups()[0]
                continue
            elif line.startswith(" Total"):
                break

            if not keep:
                continue
            if line[:-1].strip() == "":
                continue
            transfac_id, v, core_similarity, matrix_similarity, sequence = [
                x.strip() for x in line[:-1].split("|")]
            pos, strand = re.match("(\d+) \((\S)\)", v).groups()
            r.append(MATCH._make((pid, transfac_id, int(pos), strand,
                                  float(core_similarity), float(matrix_similarity), sequence)))

        yield pid, r

    offset = PARAMS["tata_search_upstream"]

    outf = IOTools.openFile(outfile + ".table.gz", "w")
    outf.write("\t".join(("transcript_id", "strand",
                          "start", "end",
                          "relative_start", "relative_end",
                          "transfac_id",
                          "core_similarity",
                          "matrix_similarity",
                          "sequence")) + "\n")

    bedf = IOTools.openFile(outfile, "w")

    c = E.Counter()
    found = set()
    for transcript_id, matches in _grouper(IOTools.openFile(outfile + ".match")):
        contig, seq_start, seq_end, strand = transcript2pos[transcript_id]
        c.promotor_with_matches += 1
        nmatches = 0
        found.add(transcript_id)
        for match in matches:

            c.matches_total += 1
            lmatch = len(match.sequence)
            if match.strand == "-":
                c.matches_wrong_strand += 1
                continue

            # get genomic location of match
            if strand == "+":
                genome_start = seq_start + match.pos
            else:
                genome_start = seq_end - match.pos - lmatch

            genome_end = genome_start + lmatch

            # get relative location of match
            if strand == "+":
                tss_start = seq_start + offset
                relative_start = genome_start - tss_start
            else:
                tss_start = seq_end - offset
                relative_start = tss_start - genome_end

            relative_end = relative_start + lmatch

            outf.write("\t".join(map(str, (
                transcript_id, strand,
                genome_start, genome_end,
                relative_start, relative_end,
                match.transfac_id,
                match.core_similarity,
                match.matrix_similarity,
                match.sequence))) + "\n")
            c.matches_output += 1
            nmatches += 1

            bedf.write("\t".join(map(
                str,
                (contig, genome_start, genome_end, transcript_id, strand,
                 match.matrix_similarity))) + "\n")

        if nmatches == 0:
            c.promotor_filtered += 1
        else:
            c.promotor_output += 1

    c.promotor_total = len(transcript2pos)
    c.promotor_without_matches = len(
        set(transcript2pos.keys()).difference(found))

    outf.close()
    bedf.close()

    with IOTools.openFile(outfile + ".summary", "w") as outf:
        outf.write("category\tcounts\n")
        outf.write(c.asTable() + "\n")

    E.info(c)
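For reference, _grouper above expects the match output to consist of blocks shaped like the following. This sample is reconstructed from the function's regular expressions and is an assumption; real match output may differ in spacing and surrounding text.

# Inspecting sequence ID   ENST00000335137
#  V$TATA_01 | 12 (+) | 0.738 | 0.856 | tataaat
#  V$MTATA_B | 40 (+) | 0.711 | 0.784 | ctataaaag
#  Total sequences length= 1200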
Example #48
0
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-c",
                      "--is-cds",
                      dest="is_cds",
                      action="store_true",
                      help="input are cds (nucleotide) sequences [%default]")

    parser.set_defaults(is_cds=False, )

    (options, args) = E.Start(parser, argv=argv)

    options.stdout.write(
        "snpid\tidentifier\tpos\treference\tvariant\tcounts\tweight\n")

    alphabet = "ACDEFGHIKLMNPQRSTVWY"

    snpid = 0

    for entry in FastaIterator.iterate(options.stdin):
        identifier = entry.title

        if options.is_cds:
            cds_sequence = entry.sequence.upper()
            assert len(cds_sequence) % 3 == 0, \
                "length of sequence '%s' is not a multiple of 3" % entry.title

            sequence = Genomics.translate(cds_sequence)
            weights = []
            for pos, cds_pos in enumerate(range(0, len(cds_sequence), 3)):
                codon = cds_sequence[cds_pos:cds_pos + 3]
                counts = collections.defaultdict(int)
                for x in range(0, 3):
                    rna = codon[x]
                    for na in "ACGT":
                        if na == rna:
                            continue
                        taa = Genomics.translate(codon[:x] + na +
                                                 codon[x + 1:])
                        counts[taa] += 1
                weights.append(counts)

        else:
            sequence = entry.sequence.upper()
            counts = {}
            for x in alphabet:
                counts[x] = 1
            weights = [counts] * len(sequence)

        for pos, ref in enumerate(sequence):

            if ref not in alphabet:
                continue
            w = weights[pos]
            t = float(sum(w.values()))
            for variant in alphabet:
                if variant == ref:
                    continue
                snpid += 1
                options.stdout.write("%s\n" % "\t".join((
                    "%010i" % snpid,
                    identifier,
                    str(pos + 1),
                    ref,
                    variant,
                    "%i" % w[variant],
                    "%6.4f" % (w[variant] / t),
                )))

    E.Stop()
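To make the per-codon weighting above concrete, here is a standalone worked example for the codon ATG, with a hand-written codon table standing in for Genomics.translate:

import collections

# codon table restricted to ATG and its nine single-base variants
CODON_TABLE = {
    "ATG": "M", "CTG": "L", "GTG": "V", "TTG": "L",
    "AAG": "K", "ACG": "T", "AGG": "R",
    "ATA": "I", "ATC": "I", "ATT": "I",
}

codon = "ATG"
counts = collections.defaultdict(int)
for x in range(3):
    for na in "ACGT":
        if na == codon[x]:
            continue
        counts[CODON_TABLE[codon[:x] + na + codon[x + 1:]]] += 1

# counts == {'L': 2, 'V': 1, 'K': 1, 'T': 1, 'R': 1, 'I': 3}, so the
# weight written for variant "I" at this position would be 3/9
print(dict(counts))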
Example #49
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--input-fasta",
                      dest="fasta",
                      type="str",
                      help="name of fasta infile")

    parser.add_option("-k",
                      "--kmer-size",
                      dest="kmer",
                      type="int",
                      help="supply kmer length")

    parser.add_option("--subset",
                      dest="subset",
                      type="int",
                      help="only analyse the first x entries")

    parser.set_defaults(fasta=None, kmer=10, subset=None)

    (options, args) = E.Start(parser)

    E.info("%s\n" % using("start"))

    assert options.fasta, "must provide a fasta filename (--input-fasta=)"

    k = KmerCounter()

    Iterator = FastaIterator.iterate(IOTools.openFile(options.fasta))

    # total entries also acts as the index for the entry_id
    total_entries = 0

    options.stdout.write("%s\n" % "\t".join(
        ("id", "unique_kmers", "non_unique_kmers", "fraction_unique")))

    # iterate transcripts, shred and identify unique kmers
    E.info("shredding fasta to identify unique kmers")
    for entry in Iterator:
        if total_entries % 1000 == 0:
            E.info("1st shred complete for %i entries" % total_entries)

        if options.subset and total_entries >= options.subset:
            break

        k.shred(entry.sequence.upper(), options.kmer)
        total_entries += 1

    total_entries = 0
    Iterator = FastaIterator.iterate(IOTools.openFile(options.fasta))

    # iterate transcripts, shred and count unique kmers
    E.info("re-shredding fasta to count unique kmers")

    for entry in Iterator:
        if total_entries % 1000 == 0:
            E.info("2nd shred complete for %i entries" % total_entries)

        transcript_id = entry.title.split()[0]

        if options.subset and total_entries >= options.subset:
            break

        total_entries += 1

        unique, non_unique = k.countUniqueKmers(entry.sequence.upper(),
                                                options.kmer)

        fraction = np.divide(float(unique), (unique + non_unique))

        options.stdout.write(
            "%s\n" %
            "\t".join(map(str, (transcript_id, unique, non_unique, fraction))))

    E.info("found %i kmers" % len(k.kmer2entry))
    E.info("written kmer counts for %i contigs" % total_entries)
    # write footer and output benchmark information.
    E.info("%s\n" % using("end"))

    E.Stop()
Example #50
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: sequences2mali.py 2782 2009-09-10 11:40:29Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-i", "--input-format", dest="input_format", type="choice",
                      choices=(
                          "plain", "fasta", "clustal", "stockholm", "phylip"),
                      help="input format of multiple alignment")

    parser.add_option("-o", "--output-format", dest="output_format", type="choice",
                      choices=("plain", "fasta", "stockholm", "phylip"),
                      help="output format of multiple alignment")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("add",),
                      help="""method to use to build multiple alignment.""")

    parser.add_option("-p", "--parameters", dest="parameters", type="string",
                      help="parameter stack for methods that require one.")

    parser.add_option("-a", "--alignment-method", dest="alignment_method", type="choice",
                      choices=("sw", "nw"),
                      help="alignment_method [%default].")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        method=None,
        parameters="",
        gop=-10.0,
        gep=-1.0,
        alignment_method="sw",
    )

    (options, args) = E.Start(parser)

    options.parameters = options.parameters.split(",")

    iterator = FastaIterator.iterate(sys.stdin)

    if options.method == "add":

        mali = Mali.Mali()

        mali.readFromFile(
            open(options.parameters[0], "r"), format=options.input_format)
        del options.parameters[0]

        old_length = mali.getLength()

        new_mali = convertMali2Mali(mali)

        if options.alignment_method == "sw":
            alignator = alignlib_lite.py_makeAlignatorFullDP(
                options.gop, options.gep)
        else:
            alignator = alignlib_lite.py_makeAlignatorFullDPGlobal(
                options.gop, options.gep)

        while True:
            # next(iterator, None) works in both Python 2.6+ and Python 3;
            # iterator.next() is Python 2 only
            cur_record = next(iterator, None)
            if cur_record is None:
                break

            map_mali2seq = alignlib_lite.py_makeAlignataVector()

            sequence = alignlib_lite.py_makeSequence(cur_record.sequence)
            profile = alignlib_lite.py_makeProfileFromMali(new_mali)

            if options.loglevel >= 4:
                options.stdlog.write(profile.Write())

            alignator.Align(profile, sequence, map_mali2seq)

            if options.loglevel >= 3:
                options.stdlog.write(map_mali2seq.Write())

            # add sequence to mali
            a = alignlib_lite.py_makeAlignatumFromString(cur_record.sequence)
            a.thisown = 0

            new_mali.addAlignatum(a, map_mali2seq, 1, 1, 1, 1, 1)

            identifier = cur_record.title
            mali.mIdentifiers.append(identifier)
            mali.mMali[identifier] = Mali.AlignedString(
                identifier, 0, len(cur_record.sequence),
                new_mali.getRow(new_mali.getWidth() - 1).getString())

        # substitute
        for x in range(old_length):
            mali.mMali[mali.mIdentifiers[x]].mString = new_mali.getRow(
                x).getString()

        mali.writeToFile(sys.stdout, format=options.output_format)

    E.Stop()
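
The heart of the "add" method in Example #50 is one profile-to-sequence alignment per incoming record. Stripped of the mali bookkeeping, that step looks roughly like the sketch below, using only the alignlib_lite calls that appear above; new_mali is assumed to be the converted alignment object built in the example, and the sequence string is a placeholder.

import alignlib_lite

# assumed input: new_mali, an alignlib multiple alignment as built above
map_mali2seq = alignlib_lite.py_makeAlignataVector()
sequence = alignlib_lite.py_makeSequence("MKTAYIAKQR")  # one incoming sequence
profile = alignlib_lite.py_makeProfileFromMali(new_mali)

# py_makeAlignatorFullDP gives the "sw" (local) aligner;
# py_makeAlignatorFullDPGlobal gives "nw" (global)
alignator = alignlib_lite.py_makeAlignatorFullDP(-10.0, -1.0)  # gop, gep
alignator.Align(profile, sequence, map_mali2seq)
# map_mali2seq now maps alignment columns to sequence positions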
Example #51
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: split_fasta.py 1714 2007-12-11 16:51:12Z andreas $"
    )

    parser.add_option("-f",
                      "--file",
                      dest="input_filename",
                      type="string",
                      help="input filename. If not given, stdin is used.",
                      metavar="FILE")

    parser.add_option(
        "-i",
        "--input-pattern",
        dest="input_pattern",
        type="string",
        help="input pattern. Parses description line in order to extract id.")

    parser.add_option(
        "-o",
        "--output-filename-pattern",
        dest="output_pattern",
        type="string",
        help="output pattern. Gives filename for a given sequence.")

    parser.add_option(
        "-n",
        "--num-sequences",
        dest="num_sequences",
        type="int",
        help="split by number of sequences (not implemented yet).")

    parser.add_option("-m",
                      "--map",
                      dest="map_filename",
                      type="string",
                      help="map filename. Map identifiers to filenames",
                      metavar="FILE")

    parser.add_option("-s",
                      "--skip-identifiers",
                      dest="skip_identifiers",
                      action="store_true",
                      help="do not write identifiers.",
                      metavar="FILE")

    parser.add_option("--min-size",
                      dest="min_size",
                      type="int",
                      help="minimum cluster size.")

    parser.set_defaults(input_filename=None,
                        map_filename=None,
                        skip_identifiers=False,
                        input_pattern="^(\S+)",
                        min_size=0,
                        num_sequences=None,
                        output_pattern="%s")

    (options, args) = E.start(parser)

    if options.input_filename:
        infile = IOTools.open_file(options.input_filename, "r")
    else:
        infile = sys.stdin

    if options.map_filename:
        map_id2filename = IOTools.ReadMap(open(options.map_filename, "r"))
    else:
        map_id2filename = {}

    if options.num_sequences:
        files = FilesChunks(chunk_size=options.num_sequences,
                            output_pattern=options.output_pattern,
                            skip_identifiers=options.skip_identifiers)

    else:
        files = Files(output_pattern=options.output_pattern,
                      skip_identifiers=options.skip_identifiers)

    if options.input_pattern:
        rx = re.compile(options.input_pattern)
    else:
        rx = None

    ninput = 0
    noutput = 0
    identifier = None
    chunk = 0

    for seq in FastaIterator.iterate(infile):

        ninput += 1

        if rx:
            try:
                identifier = rx.search(seq.title).groups()[0]
            except AttributeError:
                print("# parsing error in description line %s" % (seq.title))
                # skip records whose identifier cannot be parsed instead of
                # silently reusing the previous identifier
                continue
        else:
            identifier = seq.title

        if map_id2filename:
            if identifier in map_id2filename:
                identifier = map_id2filename[identifier]
            else:
                continue

        files.Write(identifier, seq)
        noutput += 1

    if options.input_filename:
        infile.close()

    # delete all clusters below a minimum size
    # Note: this has to be done at the end, because
    # cluster sizes are only known once both the fasta
    # file and the map have been parsed.
    if options.min_size:
        ndeleted = files.DeleteFiles(min_size=options.min_size)
    else:
        ndeleted = 0

    if options.loglevel >= 1:
        print("# input=%i, output=%i, ndeleted=%i" %
              (ninput, noutput, ndeleted))

    E.stop()
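
Example #51 relies on Files/FilesChunks helpers that are not included in this listing. Below is a rough sketch of the interface the script expects, inferred purely from the call sites (Write, DeleteFiles); the real classes may differ in detail.

import os


class Files(object):
    '''sketch: append fasta records to one file per identifier.'''

    def __init__(self, output_pattern="%s", skip_identifiers=False):
        self.output_pattern = output_pattern
        self.skip_identifiers = skip_identifiers
        self.counts = {}  # identifier -> number of records written

    def Write(self, identifier, seq):
        with open(self.output_pattern % identifier, "a") as outf:
            if not self.skip_identifiers:
                outf.write(">%s\n" % seq.title)
            outf.write("%s\n" % seq.sequence)
        self.counts[identifier] = self.counts.get(identifier, 0) + 1

    def DeleteFiles(self, min_size=0):
        '''remove output files holding fewer than min_size records,
        returning the number deleted.'''
        ndeleted = 0
        for identifier, size in list(self.counts.items()):
            if size < min_size:
                os.remove(self.output_pattern % identifier)
                del self.counts[identifier]
                ndeleted += 1
        return ndeleted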
Example #52
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"])

    parser.add_option("--input-fasta", dest="fasta", type="str", help="name of fasta infile")

    parser.add_option(
        "--method",
        dest="method",
        type="choice",
        choices=("transcript", "gene"),
        help="count unique kmers per transcript or gene",
    )

    parser.add_option("--genemap", dest="genemap", type="str", help="file mapping transcripts to genes")

    parser.add_option("-k", "--kmer-size", dest="kmer", type="int", help="supply kmer length")

    parser.add_option("--subset", dest="subset", type="int", help="only analyse the first x entries")

    parser.set_defaults(fasta=None, method="transcript", genemap=None, kmer=10, subset=None)

    (options, args) = E.Start(parser)

    E.info("%s\n" % using("start"))

    assert options.fasta, "must provide a fasta filename (--input-fasta=)"

    k = KmerCounter()

    Iterator = FastaIterator.iterate(IOTools.openFile(options.fasta))

    # total entries also acts as the index for the entry_id
    total_entries = 0

    options.stdout.write("%s\n" % "\t".join(("id", "unique_kmers", "non_unique_kmers", "fraction_unique")))

    # iterate fasta entries, shred and identify kmers

    if options.method == "gene":
        E.info("shredding genes to identify unique kmers")

        assert options.genemap, (
            "to perform a gene-level unique kmer count, " "you must supply a transcript2gene map (--genemap)"
        )
        t2g = {}
        with IOTools.openFile(options.genemap, "r") as inf:
            for line in inf:
                transcript, gene = line.strip().split("\t")
                t2g[transcript] = gene

        genes = set()
        current_gene = None
        sequences = []

        for entry in Iterator:

            if options.subset and total_entries >= options.subset:
                break

            transcript_id = entry.title.split()[0]
            gene_id = t2g[transcript_id]

            if gene_id != current_gene:
                if current_gene is None:
                    # first record: start collecting the first gene
                    # (also keep its sequence, which would otherwise be lost)
                    current_gene = gene_id
                    sequences.append(entry.sequence.upper())
                    continue

                # check that this is the first time we have seen this gene
                assert current_gene not in genes, (
                    "the fasta does not appear to be sorted in gene order, the"
                    " same gene is observed in non-consecutive positions!"
                )

                genes.add(current_gene)

                k.shred(sequences, options.kmer)

                if total_entries % 1000 == 0:
                    E.info("1st shred complete for %i genes" % total_entries)

                total_entries += 1

                sequences = [entry.sequence.upper()]
                current_gene = gene_id

            else:
                sequences.append(entry.sequence.upper())

        # catch last gene
        if not options.subset or total_entries < options.subset:
            k.shred(sequences, options.kmer)

        E.info("1st shred complete for %i genes" % total_entries)

    elif options.method == "transcript":
        E.info("shredding transcripts to identify unique kmers")
        for entry in Iterator:
            if total_entries % 1000 == 0:
                E.info("1st shred complete for %i transcripts" % total_entries)

            if options.subset and total_entries >= options.subset:
                break

            k.shred([entry.sequence.upper()], options.kmer)
            total_entries += 1
        E.info("1st shred complete for %i transcripts" % total_entries)

    total_entries = 0
    Iterator = FastaIterator.iterate(IOTools.openFile(options.fasta))

    # iterate fasta entries, shred and count unique kmers
    if options.method == "gene":
        E.info("re-shredding fasta to count gene unique kmers")

        genes = set()
        current_gene = None
        sequences = []

        for entry in Iterator:

            if options.subset and total_entries >= options.subset:
                break

            transcript_id = entry.title.split()[0]
            gene_id = t2g[transcript_id]

            if gene_id != current_gene:
                if current_gene is None:
                    # first record: start collecting the first gene
                    # (also keep its sequence, which would otherwise be lost)
                    current_gene = gene_id
                    sequences.append(entry.sequence.upper())
                    continue

                # check that this is the first time we have seen this gene
                assert current_gene not in genes, (
                    "the fasta does not appear to be sorted in gene order, the"
                    " same gene is observed in non-consecutive positions!"
                )
                genes.add(current_gene)

                unique, non_unique = k.countUniqueKmers(sequences, options.kmer)

                fraction = np.divide(float(unique), (unique + non_unique))

                options.stdout.write("%s\n" % "\t".join(map(str, (current_gene, unique, non_unique, fraction))))

                if total_entries % 1000 == 0:
                    E.info("2nd shred complete for %i genes" % total_entries)

                total_entries += 1

                sequences = [entry.sequence.upper()]
                current_gene = gene_id

            else:
                sequences.append(entry.sequence.upper())

        # catch last gene
        if not options.subset or total_entries < options.subset:
            unique, non_unique = k.countUniqueKmers(sequences, options.kmer)

            fraction = np.divide(float(unique), (unique + non_unique))

            options.stdout.write("%s\n" % "\t".join(map(str, (current_gene, unique, non_unique, fraction))))

    if options.method == "transcript":
        E.info("re-shredding fasta to count transcript unique kmers")
        for entry in Iterator:

            if total_entries % 1000 == 0:
                E.info("2nd shred complete for %i transcripts" % total_entries)

            if options.subset and total_entries >= options.subset:
                break

            transcript_id = entry.title.split()[0]

            total_entries += 1

            unique, non_unique = k.countUniqueKmers([entry.sequence.upper()], options.kmer)

            fraction = np.divide(float(unique), (unique + non_unique))

            options.stdout.write("%s\n" % "\t".join(map(str, (transcript_id, unique, non_unique, fraction))))

    E.info("found %i kmers" % len(k.kmer2entry))
    E.info("written kmer counts for %i contigs" % total_entries)
    # write footer and output benchmark information.
    E.info("%s\n" % using("end"))

    E.Stop()
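
The gene-level branches in Example #52 hand-roll a "group consecutive records by gene" loop, which is why they assert that the fasta is sorted in gene order. For comparison, an equivalent grouping can be written with itertools.groupby; the sketch below is an illustration, not the script's code, and assumes t2g and the fasta records are as above.

import itertools


def iterate_genes(records, t2g):
    '''yield (gene_id, [sequences]) from fasta records sorted in gene order.'''
    seen = set()
    by_gene = itertools.groupby(records,
                                key=lambda entry: t2g[entry.title.split()[0]])
    for gene_id, entries in by_gene:
        # the sorted-by-gene assumption: each gene forms one consecutive run
        assert gene_id not in seen, "fasta not sorted in gene order"
        seen.add(gene_id)
        yield gene_id, [entry.sequence.upper() for entry in entries]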