Example #1
    def test_split_ionfile_by_results(self):
        ion_file = "NGS/ion_file.fastq"
        blast_chunk = "NGS/_reaa.csv"

        NGS.split_ionfile_by_results(ion_file, blast_chunk)
        cmd = "grep -c '^@' NGS/_reaa.fastq"
        p = subprocess.check_output(cmd, shell=True)
        self.assertEqual(p.strip(), '1001')
        shutil.copyfile("NGS/_reaa.fastq.bak", "NGS/_reaa.fastq")
Example #2
    def test_parse_blast_results(self):
        # It should work using fasta files
        blast_table = os.path.join("NGS", "blast_table.csv")
        ion_file = os.path.join("NGS", "ion_file.fastq")

        NGS.parse_blast_results(blast_table, ion_file)
        result = glob.glob("output/gene*")
        self.assertEqual(len(result), 21)
        shutil.rmtree("output")
Example #3
    def test_split_ionfile_by_results(self):
        ion_file = os.path.join(self.cwd, "NGS/ion_file.fastq")
        blast_chunk = os.path.join(self.cwd, "NGS/_reaa.csv")

        NGS.split_ionfile_by_results(ion_file, blast_chunk)
        cmd = "grep -c '^@' " + os.path.join(self.cwd, "NGS", "_reaa.fastq")
        p = subprocess.check_output(cmd, shell=True)
        self.assertEqual(int(p.strip()), 1001)
        os.remove(os.path.join(self.cwd, "NGS", "_reaa.fastq"))
Example #4
    def test_filter_reads(self):
        ion_chunk = "NGS/_reaa.fastq"
        blast_chunk = "NGS/_reaa.csv"
        folder = "NGS"
        NGS.filter_reads(ion_chunk, blast_chunk, folder)
        # it should generate many gene_ files
        result = glob.glob("NGS/gene*")
        self.assertEqual(len(result), 21)
        for i in result:
            os.remove(i)
Example #5
    def test_prepare_data(self):
        ionfile = os.path.join(self.cwd, "NGS", "ion_file.fastq")
        NGS.prepare_data(ionfile, 8)

        expected_file1 = os.path.join("data", "modified", "wrk_ionfile.fasta")
        expected_file2 = os.path.join("data", "modified", "wrk_ionfile.fastq")
        self.assertTrue(os.path.isfile(expected_file1))
        self.assertTrue(os.path.isfile(expected_file2))
        os.remove(expected_file1)
        os.remove(expected_file2)
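The two files checked above are a FASTQ working copy and its FASTA counterpart under data/modified. As a rough sketch of that kind of conversion with Biopython (the paths come from the test's expectations, not from prepare_data itself):

import os
from Bio import SeqIO

# Illustrative only: write a FASTA version of the working FASTQ copy,
# mirroring the data/modified layout that test_prepare_data checks for.
src = os.path.join("data", "modified", "wrk_ionfile.fastq")
dst = os.path.join("data", "modified", "wrk_ionfile.fasta")
count = SeqIO.convert(src, "fastq", dst, "fasta")
print("converted %d reads" % count)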
Example #6
    def test_filter_reads(self):
        folder = "NGS"

        for i in glob.glob(os.path.join("NGS", "gene*")):
            os.remove(i)

        ion_chunk = os.path.join("NGS", "reaa.fastq")
        blast_chunk = os.path.join("NGS", "reaa.csv")
        NGS.filter_reads(ion_chunk, blast_chunk, folder)

        cmd = "cat " + os.path.join("NGS", "gene*")
        cmd += " | grep -c '^@'"
        p = subprocess.check_output(cmd, shell=True)

        for i in glob.glob(os.path.join("NGS", "gene*")):
            os.remove(i)
        self.assertEqual(int(p.strip()), 23)
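The shell pipeline above counts lines beginning with '@'; since FASTQ quality strings can also start with '@', parsing the records is a more robust way to count reads. A hedged alternative using Biopython over the same NGS/gene* fixtures:

import glob
import os
from Bio import SeqIO

# Count reads across all NGS/gene* chunks by parsing the FASTQ records,
# which avoids miscounting quality lines that happen to start with '@'.
total = sum(1 for name in glob.glob(os.path.join("NGS", "gene*"))
            for _ in SeqIO.parse(name, "fastq"))
print(total)  # the test above expects 23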
Example #7
def separate_by_index(fastq_file, index_list, folder="", levenshtein_distance=1):
    '''
    This function divides FASTQ reads into bins according to a list of indexes
    (or barcodes).
    The *index_list* should be in FASTA format.
    It will compare the template indexes with those in the reads and accept
    indexes that differ by no more than the *levenshtein* distance (default
    1 base pair difference).

    See http://en.wikipedia.org/wiki/Levenshtein_distance

    * ``fastq_file`` FASTQ format file containing reads as produced by IonTorrent
    * ``index_list`` FASTA format file containing indexes (or barcodes)
    * ``folder`` *Optional*: directory where the output FASTQ files will be written
    * ``levenshtein_distance`` *Optional*, default = 1: maximum number of differing nucleotides accepted when comparing template and sequenced indexes (due to errors in base calling during sequencing).

    Example:

    >>> from pyphylogenomics import NGS
    >>> fastq_file = "gene_rps5.fastq"
    >>> index_list = "indexes.fasta"
    >>> folder     = "output"
    >>> NGS.separate_by_index(fastq_file, index_list, folder)

    You can also automate parsing many FASTQ files at once:

    >>> from pyphylogenomics import NGS
    >>> import glob  # this module allows us to select many files using wildcards
    >>> index_list = "indexes.fasta"
    >>> folder     = "output"
    >>> for file in glob.glob("output/gene*.fastq"):
    ...     NGS.separate_by_index(file, index_list, folder)
    '''
    import os
    import re

    from Bio import SeqIO
    from pyphylogenomics import NGS

    print("Processing file " + fastq_file)
    if folder != "":
        folder = re.sub("/$", "", folder)
        folder = os.path.abspath(folder)
        print("Output files will be written into " + folder)

    # For every template index, scan all reads and append the matching ones
    # to an index-specific FASTQ file.
    for seq_record in SeqIO.parse(index_list, "fasta"):
        for fastq_record in SeqIO.parse(fastq_file, "fastq"):
            found_index = NGS.find_index_in_seq(seq_record, fastq_record,
                                                levenshtein_distance)
            if found_index == "TRUE":
                basename = os.path.basename(fastq_file)
                filename = "index_" + str(seq_record.id) + "_" \
                    + re.sub(".fastq", "", basename) + ".fastq"
                if folder != "":
                    filename = os.path.join(folder, filename)
                output_handle = open(filename, "a")
                SeqIO.write(fastq_record, output_handle, "fastq")
                output_handle.close()
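The docstring above relies on the Levenshtein distance for index matching. Purely as an illustration (this is not pyphylogenomics' own code), a plain dynamic-programming version of that distance is sketched below; NGS.find_index_in_seq is assumed to perform an equivalent comparison between each template index and the read:

def levenshtein(a, b):
    """Number of single-character edits (insertions, deletions,
    substitutions) needed to turn string a into string b."""
    previous = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        current = [i]
        for j, cb in enumerate(b, start=1):
            cost = 0 if ca == cb else 1
            current.append(min(previous[j] + 1,          # deletion
                               current[j - 1] + 1,       # insertion
                               previous[j - 1] + cost))  # substitution
        previous = current
    return previous[-1]

# With the default levenshtein_distance=1, an index differing from the
# template by a single base call is still accepted:
assert levenshtein("ACGTAC", "ACGTAA") == 1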
Example #8
    def test_prune(self):
        folder = "NGS"

        blast_data = []
        f = open("NGS/blast_data.csv", "r")
        tmp = f.readlines()
        f.close()
        for i in tmp:
            blast_data.append(i.strip())

        seq_record = SeqIO.parse("NGS/seq_record.fastq", "fastq")

        ion_id = "3856"
        min_aln_length = "40"

        result = NGS.prune(folder, blast_data, seq_record, ion_id,
                            min_aln_length)
        # It should drop one seq_record from the blast_data
        self.assertEqual(len(result), 998)
Example #9
def filter_reads(ion_chunk, blast_chunk, folder):
    '''
    \* *Internal function* \*

    Accepts only alignments longer than 40 bp, which is longer than our
    primer lengths.
    '''
    from Bio import SeqIO
    from pyphylogenomics import NGS

    min_aln_length = 40

    blast_file = open(blast_chunk, "r")
    tmp = blast_file.readlines()
    blast_file.close()

    blast_data = []
    for i in tmp:
        blast_data.append(i.strip())

    # iterate over IonTorrent reads
    for seq_record in SeqIO.parse(ion_chunk, "fastq"):
        if len(blast_data) > 0:
            # avoid processing seq_records that are not in the blast file;
            # compare against the first id in blast_data
            first_id_in_blast_data = blast_data[0].split(",")[0]

            if int(seq_record.id) >= int(first_id_in_blast_data):
                if str(seq_record.id) == first_id_in_blast_data:
                    blast_data = NGS.prune(folder, blast_data, seq_record,
                                           first_id_in_blast_data, min_aln_length)
                else:
                    break
Example #10
from pyphylogenomics import NGS
import sys

ionfile = sys.argv[1].strip()
index_length = 0
NGS.prepare_data(ionfile, index_length)
Example #11
from pyphylogenomics import NGS
import sys

blast_table = sys.argv[1].strip()
ion_file = "data/modified/wrk_ionfile.fastq"
NGS.parse_blast_results(blast_table, ion_file)