Example #1
0
def index(args):
    """Create a genome index and a bedtools-ready genome FASTA file.

    Reads three attributes from ``args``: ``indexdir`` (parent directory of
    all indexes, must already exist), ``indexname`` (sub-directory to create
    the index in) and ``fastadir`` (directory holding the FASTA files).

    Exits with status 1 when ``args.indexdir`` does not exist. Re-raises any
    ``pybedtools.helpers.BEDToolsError`` whose message does not contain
    "generating".
    """
    if not os.path.exists(args.indexdir):
        # Parenthesised single argument: valid on both Python 2 and 3.
        print("Index_dir %s does not exist!" % (args.indexdir))
        sys.exit(1)

    fasta_dir = args.fastadir
    index_dir = os.path.join(args.indexdir, args.indexname)

    g = GenomeIndex()
    g.create_index(fasta_dir, index_dir)

    # Create genome FASTA file for use with bedtools
    genome_fa = os.path.join(index_dir, "genome.fa")
    with open(genome_fa, "w") as out:
        for fname in find_by_ext(fasta_dir, FASTA_EXT):
            # Close every input file as soon as it has been copied.
            with open(fname) as f:
                for line in f:
                    out.write(line)

    # genome.fa was just rewritten, so an existing bedtools index (.fai)
    # would no longer describe it; remove it so it is not reused.
    if os.path.exists(genome_fa + ".fai"):
        os.unlink(genome_fa + ".fai")

    test_chr = g.get_chromosomes()[0]
    # Text mode so that writing a str works on Python 3 as well; the context
    # manager closes (and thereby deletes) the temporary file when done.
    with NamedTemporaryFile(mode="w") as tmp:
        tmp.write("{}\t1\t2\n".format(test_chr))
        tmp.flush()

        b = pybedtools.BedTool(tmp.name)
        try:
            # NOTE(review): presumably run only to make bedtools build the
            # .fai index for genome.fa -- confirm.
            b.nucleotide_content(fi=genome_fa)
        except pybedtools.helpers.BEDToolsError as e:
            # Errors mentioning "generating" are tolerated; anything else is
            # a real failure.
            if "generating" not in str(e):
                raise
Example #2
0
def create_bedtools_fa(index_dir, fasta_dir):
    """Write ``index_dir/genome.fa`` and run a bedtools query against it.

    All FASTA files found in ``fasta_dir`` are concatenated into a single
    ``genome.fa`` inside ``index_dir``. Any existing ``genome.fa.fai`` is
    removed, and a one-interval BED file on the first chromosome of the
    index is passed to ``nucleotide_content`` with the new FASTA file.

    Re-raises any ``pybedtools.helpers.BEDToolsError`` whose message does not
    contain "generating".
    """
    g = GenomeIndex(index_dir)

    genome_fa = os.path.join(index_dir, "genome.fa")
    # Create genome FASTA file for use with bedtools
    with open(genome_fa, 'w') as out:
        for fname in find_by_ext(fasta_dir, FASTA_EXT):
            with open(fname) as f:
                out.writelines(f)

    # Delete old bedtools index if it exists, otherwise bedtools will
    # give an error.
    fai = genome_fa + ".fai"
    if os.path.exists(fai):
        os.unlink(fai)

    test_chr = g.get_chromosomes()[0]
    # The context manager closes (and thereby deletes) the temporary BED file
    # deterministically instead of leaving that to garbage collection.
    with NamedTemporaryFile(mode="w") as tmp:
        tmp.write("{}\t1\t2\n".format(test_chr))
        # Flush so the external tool sees the content on disk.
        tmp.flush()

        b = pybedtools.BedTool(tmp.name)
        try:
            # pylint: disable=unexpected-keyword-arg
            b.nucleotide_content(fi=genome_fa)
        except pybedtools.helpers.BEDToolsError as e:
            # Errors mentioning "generating" are tolerated; anything else is
            # a real failure.
            if "generating" not in str(e):
                raise
Example #3
0
def create_bedtools_fa(index_dir, fasta_dir):
    """Write ``index_dir/genome.fa`` and run a bedtools query against it.

    Concatenates every FASTA file found in ``fasta_dir`` into
    ``index_dir/genome.fa``, then passes a one-interval BED file on the first
    chromosome of the index to ``nucleotide_content`` with that FASTA file.

    Re-raises any ``pybedtools.helpers.BEDToolsError`` whose message does not
    contain "generating".
    """
    g = GenomeIndex(index_dir)

    # Create genome FASTA file for use with bedtools
    genome_fa = os.path.join(index_dir, "genome.fa")
    with open(genome_fa, 'w') as out:
        for fname in find_by_ext(fasta_dir, FASTA_EXT):
            # Open each input in its own context so the handle is closed
            # right after copying rather than leaked.
            with open(fname) as f:
                for line in f:
                    out.write(line)

    # genome.fa was just rewritten, so an existing bedtools index (.fai)
    # would no longer describe it; remove it so it is not reused.
    if os.path.exists(genome_fa + ".fai"):
        os.unlink(genome_fa + ".fai")

    test_chr = g.get_chromosomes()[0]
    # mode="w": the default binary mode rejects str on Python 3.
    # The context manager closes (and thereby deletes) the file when done.
    with NamedTemporaryFile(mode="w") as tmp:
        tmp.write("{}\t1\t2\n".format(test_chr))
        tmp.flush()

        b = pybedtools.BedTool(tmp.name)
        try:
            # NOTE(review): presumably run only to make bedtools build the
            # .fai index for genome.fa -- confirm.
            b.nucleotide_content(fi=genome_fa)
        except pybedtools.helpers.BEDToolsError as e:
            # Errors mentioning "generating" are tolerated; anything else is
            # a real failure.
            if "generating" not in str(e):
                raise
Example #4
0
def create_bedtools_fa(index_dir, fasta_dir):
    """Build the combined genome FASTA used with bedtools and query it once.

    Steps:
      1. Concatenate all FASTA files from ``fasta_dir`` into
         ``index_dir/genome.fa``.
      2. Remove an existing ``genome.fa.fai`` so it is not reused for the
         rewritten FASTA file.
      3. Call ``nucleotide_content`` for a one-interval BED file on the first
         chromosome reported by the index.

    Re-raises any ``pybedtools.helpers.BEDToolsError`` whose message does not
    contain "generating".
    """
    g = GenomeIndex(index_dir)
    genome_fa = os.path.join(index_dir, "genome.fa")

    # Create genome FASTA file for use with bedtools
    with open(genome_fa, 'w') as out:
        for fname in find_by_ext(fasta_dir, FASTA_EXT):
            # "with" guarantees the input handle is closed after copying.
            with open(fname) as fasta:
                out.writelines(fasta)

    stale_index = genome_fa + ".fai"
    if os.path.exists(stale_index):
        os.unlink(stale_index)

    test_chr = g.get_chromosomes()[0]
    # Text mode keeps the str write valid on Python 3; leaving the block
    # closes and deletes the temporary BED file.
    with NamedTemporaryFile(mode="w") as tmp:
        tmp.write("{}\t1\t2\n".format(test_chr))
        tmp.flush()

        b = pybedtools.BedTool(tmp.name)
        try:
            b.nucleotide_content(fi=genome_fa)
        except pybedtools.helpers.BEDToolsError as e:
            # Only errors that mention "generating" are ignored.
            if "generating" not in str(e):
                raise
Example #5
0
class GenomeIndex:
    """Index FASTA-formatted files for faster retrieval of sequences.

    Typical use:

        # Make index
        g = GenomeIndex()
        g.create_index("/usr/share/genomes/hg18", "/usr/share/genome_index/hg18")

        # Retrieve sequence
        g = GenomeIndex("/usr/share/genome_index/hg18")
        seq = g.get_sequence("chr17", "7520037", "7531588")

        # Batch bed-file to fasta-file
        track2fasta("/usr/share/genome_index/hg18", "p53_targets.bed", "p53_targets.fa")
    """

    def __init__(self, index_dir=None):
        """Initialize GenomeIndex with index_dir as optional argument.

        When index_dir is given and already contains the parameter file,
        the existing index is loaded immediately.
        """
        self.param_file = "index.params"
        self.size_file = "genome.size"
        self.index_dir = index_dir
        self.fasta_dir = None

        self.size = {}
        self.fasta_file = {}
        self.index_file = {}
        self.line_size = {}
        # struct format character used for every offset in an index file
        self.pack_char = "L"

        if self.index_dir and os.path.exists(
                os.path.join(self.index_dir, self.param_file)):
            self._read_index_file()

    def _check_dir(self, dir):
        """Check if dir exists; if not, print a message and exit with status 1."""
        if not os.path.exists(dir):
            print("Directory %s does not exist!" % dir)
            sys.exit(1)

    def _make_index(self, fasta, index):
        """Index a single, one-sequence FASTA file.

        Writes to ``index`` the packed file offset at which each line after
        the header line of ``fasta`` starts.
        """
        # Binary mode on both sides: pack() produces raw bytes, and tell() on
        # a binary file is the true byte offset of the next line.
        with open(fasta, "rb") as f, open(index, "wb") as out:
            # Skip first line of fasta-file
            f.readline()
            offset = f.tell()
            line = f.readline()
            while line:
                out.write(pack(self.pack_char, offset))
                offset = f.tell()
                line = f.readline()

    def create_index(self, fasta_dir=None, index_dir=None):
        """Index all fasta-files in fasta_dir (one sequence per file!) and
        store the results in index_dir.

        fasta_dir and index_dir default to self.fasta_dir and self.index_dir.
        Raises IOError when no FASTA files are found; exits with status 1
        when a directory is undefined, missing or not writable.
        """

        # Use default directories if they are not supplied
        if not fasta_dir:
            fasta_dir = self.fasta_dir

        if not index_dir:
            index_dir = self.index_dir

        # Can't continue if we still don't have an index_dir or fasta_dir
        if not fasta_dir:
            print("fasta_dir not defined!")
            sys.exit(1)

        if not index_dir:
            print("index_dir not defined!")
            sys.exit(1)

        index_dir = os.path.abspath(index_dir)
        fasta_dir = os.path.abspath(fasta_dir)

        self.index_dir = index_dir

        # Prepare index directory
        if not os.path.exists(index_dir):
            try:
                os.mkdir(index_dir)
            except OSError as e:
                if e.args[0] == 13:  # errno 13: permission denied
                    sys.stderr.write(
                        "No permission to create index directory. Superuser access needed?\n"
                    )
                    # Non-zero status so callers can detect the failure.
                    sys.exit(1)
                else:
                    # write() needs a string, not the exception object; the
                    # existence check below still stops the run if the
                    # directory is really missing.
                    sys.stderr.write("%s\n" % e)

        # Directories need to exist
        self._check_dir(fasta_dir)
        self._check_dir(index_dir)

        # Get all fasta-files

        fastafiles = find_by_ext(fasta_dir, FASTA_EXT)
        if not fastafiles:
            raise IOError(
                "No fastafiles found in {} with extension in {}".format(
                    fasta_dir, ",".join(FASTA_EXT)))

        # param_file will hold all the information about the location of the
        # fasta-files, indices and length of the sequences
        param_file = os.path.join(index_dir, self.param_file)
        size_file = os.path.join(index_dir, self.size_file)

        try:
            out = open(param_file, "w")
        except IOError as e:
            if e.args[0] == 13:  # errno 13: permission denied
                sys.stderr.write(
                    "No permission to create files in index directory. Superuser access needed?\n"
                )
            else:
                sys.stderr.write("%s\n" % e)
            # Without the parameter file nothing useful can follow.
            sys.exit(1)
        # NOTE(review): the visible excerpt of this method ends here; 'out'
        # and 'size_file' are not used or closed within it -- confirm
        # against the complete method.
Example #6
0
    def create_index(self, fasta_dir=None, index_dir=None):
        """Index all fasta-files in fasta_dir (one sequence per file!) and
        store the results in index_dir.

        For every FASTA file one line is written to the parameter file
        (sequence name, FASTA path, index path, line size, total size) and
        one to the size file (sequence name, total size), and a
        "<seqname>.index" file is created through _make_index(). Afterwards
        the new index is loaded with _read_index_file().

        fasta_dir and index_dir default to self.fasta_dir and self.index_dir.

        Raises IOError when no FASTA files are found. Exits with status 1
        when a directory is undefined, missing or not writable, or when a
        FASTA file is invalid or holds more than one sequence.
        """
        # Use default directories if they are not supplied
        fasta_dir = fasta_dir or self.fasta_dir
        index_dir = index_dir or self.index_dir

        # Can't continue if we still don't have an index_dir or fasta_dir
        if not fasta_dir:
            print("fasta_dir not defined!")
            sys.exit(1)

        if not index_dir:
            print("index_dir not defined!")
            sys.exit(1)

        index_dir = os.path.abspath(index_dir)
        fasta_dir = os.path.abspath(fasta_dir)

        self.index_dir = index_dir

        # Prepare index directory
        if not os.path.exists(index_dir):
            try:
                os.mkdir(index_dir)
            except OSError as e:
                if e.errno == 13:  # permission denied
                    sys.stderr.write("No permission to create index directory. Superuser access needed?\n")
                    # Non-zero status so callers can detect the failure.
                    sys.exit(1)
                # write() needs a string, not the exception object. The
                # existence check below still stops the run if the directory
                # is really missing.
                sys.stderr.write("{}\n".format(e))

        # Directories need to exist
        self._check_dir(fasta_dir)
        self._check_dir(index_dir)

        # Get all fasta-files
        fastafiles = find_by_ext(fasta_dir, FASTA_EXT)
        if not fastafiles:
            msg = "No fastafiles found in {} with extension in {}".format(
                fasta_dir, ",".join(FASTA_EXT))
            raise IOError(msg)

        # param_file will hold all the information about the location of the
        # fasta-files, indices and length of the sequences
        param_file = os.path.join(index_dir, self.param_file)
        size_file = os.path.join(index_dir, self.size_file)

        try:
            out = open(param_file, "w")
        except IOError as e:
            if e.errno == 13:  # permission denied
                sys.stderr.write("No permission to create files in index directory. Superuser access needed?\n")
            else:
                sys.stderr.write("{}\n".format(e))
            # Without the parameter file nothing can be written; stop here
            # instead of failing later on an unbound 'out'.
            sys.exit(1)

        # Context managers close both output files on every exit path,
        # including the sys.exit() calls inside the loop.
        with out, open(size_file, "w") as s_out:
            for fasta_file in fastafiles:
                with open(fasta_file) as f:
                    header = f.readline()
                    if not header.startswith(">"):
                        sys.stderr.write("%s is not a valid FASTA file, expected > at first line\n" % fasta_file)
                        sys.exit(1)

                    seqname = header.strip().replace(">", "")
                    line = f.readline()
                    # Length of the first sequence line, recorded as the
                    # line size of this file.
                    line_size = len(line.strip())

                    total_size = 0
                    while line:
                        line = line.strip()
                        if line.startswith(">"):
                            sys.stderr.write("Sorry, can only index genomes with "
                                             "one sequence per FASTA file\n%s contains multiple "
                                             "sequences\n" % fasta_file)
                            sys.exit(1)

                        total_size += len(line)
                        line = f.readline()

                index_file = os.path.join(index_dir, "%s.index" % seqname)

                out.write("{}\t{}\t{}\t{}\t{}\n".format(
                    seqname, fasta_file, index_file, line_size, total_size))
                s_out.write("{}\t{}\n".format(seqname, total_size))

                self._make_index(fasta_file, index_file)

        # Read the index we just made so we can immediately use it
        self._read_index_file()