Example #1
0
def _read_remote_entries(remote_file):
    '''Return the first two tab-separated fields of every line of
    `remote_file`, closing the file before returning.'''
    with IOTools.open_file(remote_file) as inf:
        return [line.strip().split("\t")[:2] for line in inf]


def downloadFiles(infiles, outfile):
    '''Download the data listed in a ``.remote`` file.

    Every line of the ``.remote`` file is tab-separated; its first two
    fields are read as repository name and accession.

    * ``SRA`` entries are prefetched and extracted into the directory
      of `outfile`, unless ``<outfile>.1.gz`` already exists.
    * ``GDC`` entries are fetched via ``Sra.process_remote_BAM`` into
      ``temp_bams/<basename of the input>`` and copied to
      ``bam.dir/<input basename without its last extension>.bam``,
      unless that file already exists.

    Inputs that do not end in ``.remote`` are ignored.

    Arguments
    ---------
    infiles : string
        Path of the input file.  Despite the plural name this is a
        single filename.
    outfile : string
        Requested output path.  Its directory receives SRA extractions
        and ``<outfile>.1.gz`` is the SRA "already done" marker.

    Raises
    ------
    ValueError
        If a line names a repository other than ``SRA`` or ``GDC``, or
        has fewer than two tab-separated fields.
    '''
    infile = infiles
    if not infile.endswith(".remote"):
        return

    # The GDC branch rebinds `infile` and `outfile` because the shell
    # template refers to them by name (P.run presumably fills the
    # %(name)s placeholders from this function's local variables --
    # confirm against the Pipeline module).  Keep the original values so
    # that later lines of the same .remote file still see them.
    remote_file = infile
    requested_outfile = outfile

    basefile = os.path.basename(remote_file)
    filename = "temp_bams/%s" % basefile
    outdir = os.path.dirname(outfile)

    for repo, acc in _read_remote_entries(remote_file):
        if repo == "SRA":
            if not os.path.isfile(requested_outfile + ".1.gz"):
                statement = "; ".join(
                    [Sra.prefetch(acc),
                     Sra.extract(acc, outdir)])
                P.run(statement)

        elif repo == "GDC":
            outfile = "bam.dir/" + os.path.splitext(basefile)[0] + ".bam"

            # use the first GDC token file found in the working
            # directory, if there is one
            token = glob.glob("gdc-user-token*")
            token = token[0] if token else None

            s, bam_files = Sra.process_remote_BAM(
                remote_file,
                token,
                filename,
                filter_bed=os.path.join(
                    PARAMS["annotations_dir"],
                    PARAMS["annotations_interface_contigs_bed"]))

            infile = " ".join(bam_files)
            if not os.path.isfile(outfile):
                statement = "; ".join([
                    "mkdir -p %(filename)s", s,
                    '''cp %(infile)s %(outfile)s;
                            rm -r %(filename)s'''
                ])
                P.run(statement)

        else:
            raise ValueError("Unknown repository: %s" % repo)
def assembleWithStringTie(infiles, outfile):
    '''Assemble transcripts from an alignment file with StringTie.

    The StringTie output is gzip-compressed into `outfile`; its stderr
    goes to ``<outfile>.log``.  If the input ends in ``.remote`` the
    statement returned by ``Sra.process_remote_BAM`` is run first, into
    a temporary directory that is removed afterwards.

    Arguments
    ---------
    infiles : tuple
        ``(infile, reference)``: the alignment file (or ``.remote``
        file) and the reference annotation, which is read through
        ``zcat``.
    outfile : string
        Path of the gzip-compressed output.
    '''

    infile, reference = infiles

    # job_threads / job_memory are not used directly.  P.run() is called
    # without arguments, so it presumably reads them (and `statement`)
    # from this function's local variables -- confirm against the
    # Pipeline module.
    job_threads = PARAMS["stringtie_threads"]
    job_memory = PARAMS["stringtie_memory"]

    statement = '''stringtie %(infile)s
                           -p %(stringtie_threads)s
                           -G <(zcat %(reference)s)
                           %(stringtie_options)s
                           2> %(outfile)s.log
                   | gzip > %(outfile)s '''

    if infile.endswith(".remote"):
        token = glob.glob("gdc-user-token*")
        token = token[0] if token else None

        # The temporary name is used as a directory.  If the helper has
        # already created it as a regular file, mkdir would fail, so
        # remove the placeholder first.
        tmpfilename = P.getTempFilename()
        if os.path.exists(tmpfilename):
            os.unlink(tmpfilename)

        s, infile = Sra.process_remote_BAM(
            infile,
            token,
            tmpfilename,
            filter_bed=os.path.join(
                PARAMS["annotations_dir"],
                PARAMS["annotations_interface_contigs_bed"]))

        infile = " ".join(infile)
        statement = "; checkpoint ;".join(
            ["mkdir -p %(tmpfilename)s", s, statement,
             "rm -r %(tmpfilename)s"])

    P.run()
Example #3
0
def quantifyWithSalmon(infiles, outfile):
    '''Quantify existing samples against genesets.

    The input BAM is name-sorted, converted to paired FASTQ files and
    quantified with ``salmon quant`` against
    ``salmon_index/<gtf basename>.salmon.index``.  ``quant.sf`` is then
    moved to ``<outfile>.sf`` and the intermediate FASTQ and sorted BAM
    files are deleted.  If the input ends in ``.remote`` the statement
    returned by ``Sra.process_remote_BAM`` is run first, into
    ``temp_bams/<basename of the input>``, which is removed afterwards.

    Arguments
    ---------
    infiles : tuple
        ``(infile, gtffile)``: the BAM (or ``.remote``) file and the
        geneset whose basename (minus ``.gz``) selects the index.
    outfile : string
        Salmon output directory; the FASTQ names are derived from it.
    '''
    # Not used directly; presumably picked up by P.run from this
    # function's local variables -- confirm against the Pipeline module.
    job_threads = 2
    job_memory = "16G"

    infile, gtffile = infiles
    basefile = os.path.basename(infile)
    sample_name = basefile.split(os.extsep, 1)
    sorted_bam = "sorted_bams/" + sample_name[0] + "_sorted.bam"
    gtfbase = P.snip(os.path.basename(gtffile), ".gz")
    salmonIndex = "salmon_index/" + gtfbase + ".salmon.index"
    fastq1 = P.snip(outfile, "_agg-agg-agg") + ".1.fastq"
    fastq2 = P.snip(outfile, "_agg-agg-agg") + ".2.fastq"
    salmon_options = PARAMS["salmon_quantoptions"]

    statement = '''
    samtools sort -n %(infile)s -o %(sorted_bam)s;
    samtools fastq
         -1 %(fastq1)s
         -2 %(fastq2)s
         -0 /dev/null -s /dev/null -n -F 0x900
         %(sorted_bam)s; 
    salmon quant -i %(salmonIndex)s
        --libType IU
        -1 %(fastq1)s
        -2 %(fastq2)s
        -o %(outfile)s
        %(salmon_options)s; 
    mv %(outfile)s/quant.sf %(outfile)s.sf; 
    rm %(fastq1)s; rm %(fastq2)s; rm %(sorted_bam)s 
    '''

    if infile.endswith(".remote"):
        token = glob.glob("gdc-user-token*")
        token = token[0] if token else None

        filename = "temp_bams/%s" % basefile

        # NOTE(review): tmpfilename is not referenced by the template
        # below; it is kept (and its placeholder removed) exactly as
        # before.
        tmpfilename = P.get_temp_filename()
        if os.path.exists(tmpfilename):
            os.unlink(tmpfilename)

        s, infile = Sra.process_remote_BAM(
            infile,
            token,
            filename,
            filter_bed=os.path.join(
                PARAMS["annotations_dir"],
                PARAMS["annotations_interface_contigs_bed"]))

        infile = " ".join(infile)
        # -p: the download directory is nested (temp_bams/<name>), so a
        # plain mkdir fails when temp_bams/ is missing or when the
        # directory is left over from an interrupted run.
        statement = "; ".join(
            ["mkdir -p %(filename)s", s, statement, "rm -r %(filename)s"])

    P.run(statement)
def assembleWithStringTie(infiles, outfile):
    '''Filter an alignment with portcullis, then assemble it with
    StringTie.

    The shell statement built here

    1. runs ``portcullis full`` on the input, writing to
       ``portcullis/<basename of infile>/``,
    2. moves ``portcullis.filtered.bam`` to a temporary path and
       removes the portcullis directory,
    3. runs ``stringtie`` on the filtered BAM with `reference` (read
       through ``zcat``) as ``-G`` annotation, writing stderr to
       ``<outfile>.log`` and the gzip-compressed result to `outfile`,
    4. deletes the temporary BAM.

    If the input ends in ``.remote`` the statement returned by
    ``Sra.process_remote_BAM`` is run first, into a temporary directory
    that is removed afterwards.

    Arguments
    ---------
    infiles : tuple
        ``(infile, reference)``: alignment (or ``.remote``) file and
        reference annotation.
    outfile : string
        Path of the gzip-compressed output.
    '''

    infile, reference = infiles
    basefile = os.path.basename(infile)
    # Not used directly in this function; presumably read by P.run from
    # the local variables -- confirm against the Pipeline module.
    job_threads = PARAMS["stringtie_threads"]
    job_memory = PARAMS["stringtie_memory"]
    # Only the name is needed: any file that already exists at that
    # path is removed, and the statement below moves the filtered BAM
    # there.
    tmpfile = P.get_temp_filename()
    if os.path.exists(tmpfile):
        os.unlink(tmpfile)

    # portcullis_bedref, portcullis_fastaref, stringtie_threads and
    # stringtie_options are not defined in this function; presumably
    # they are filled in from PARAMS by P.run -- confirm.
    statement = '''
                    portcullis full 
                            -t 1
                            -o portcullis/%(basefile)s/
                            -r %(portcullis_bedref)s
                            -b 
                            %(portcullis_fastaref)s
                            %(infile)s &&
                    mv portcullis/%(basefile)s/portcullis.filtered.bam %(tmpfile)s &&
                    rm -r portcullis/%(basefile)s/ &&
                    stringtie %(tmpfile)s
                           -p %(stringtie_threads)s
                           -G <(zcat %(reference)s)
                           %(stringtie_options)s
                           2> %(outfile)s.log
                   | gzip > %(outfile)s &&
                   rm %(tmpfile)s'''

    if infile.endswith(".remote"):
        # first GDC token file in the working directory, or None
        token = glob.glob("gdc-user-token*")
        # this temporary name is used as a directory (see mkdir below),
        # so any file already at that path is removed
        tmpfilename = P.get_temp_filename()
        if os.path.exists(tmpfilename):
            os.unlink(tmpfilename)
        if len(token) > 0:
            token = token[0]
        else:
            token = None

        # s: shell statement; infile: rebound to the second return
        # value, which is joined with spaces below
        s, infile = Sra.process_remote_BAM(
            infile,
            token,
            tmpfilename,
            filter_bed=os.path.join(
                PARAMS["annotations_dir"],
                PARAMS["annotations_interface_contigs_bed"]))

        infile = " ".join(infile)
        # wrap the assembly statement between creating and removing the
        # download directory
        statement = "; ".join([
            "mkdir -p %(tmpfilename)s", s, statement, "rm -r %(tmpfilename)s"
        ])

    P.run(statement)
Example #5
0
def quantifyWithSalmon(infiles, outfile):
    '''Quantify existing samples against genesets.

    The input BAM is converted to paired FASTQ files and quantified
    with ``salmon quant`` against
    ``salmon_index/<gtf basename>.salmon.index``.  ``quant.sf`` is then
    moved to ``<outfile>.sf`` and the FASTQ files are deleted.  If the
    input ends in ``.remote`` the statement returned by
    ``Sra.process_remote_BAM`` is run first, into
    ``temp_bams/<basename of the input>``, which is removed afterwards.

    Arguments
    ---------
    infiles : tuple
        ``(infile, gtffile)``: the BAM (or ``.remote``) file and the
        geneset whose basename (minus ``.gz``) selects the index.
    outfile : string
        Salmon output directory; the FASTQ names are derived from it.
    '''
    # job_threads is referenced by the template (--threads).  P.run() is
    # called without arguments, so it presumably reads `statement`,
    # job_threads and job_memory from this function's local variables
    # -- confirm against the Pipeline module.
    job_threads = 2
    job_memory = "8G"

    infile, gtffile = infiles
    basefile = os.path.basename(infile)
    gtfbase = P.snip(os.path.basename(gtffile), ".gz")
    salmonIndex = "salmon_index/" + gtfbase + ".salmon.index"
    fastq1 = P.snip(outfile, "_agg-agg-agg") + ".1.fastq"
    fastq2 = P.snip(outfile, "_agg-agg-agg") + ".2.fastq"
    salmon_options = PARAMS["salmon_quantoptions"]

    statement = '''
    samtools fastq
         -1 %(fastq1)s
         -2 %(fastq2)s
         %(infile)s; 
    salmon quant -i %(salmonIndex)s
        --libType A
        -1 %(fastq1)s
        -2 %(fastq2)s
        -o %(outfile)s
        --threads %(job_threads)s
        %(salmon_options)s; 
    checkpoint; 
    mv %(outfile)s/quant.sf %(outfile)s.sf; 
    rm %(fastq1)s; rm %(fastq2)s 
    '''

    if infile.endswith(".remote"):
        token = glob.glob("gdc-user-token*")
        token = token[0] if token else None

        filename = "temp_bams/%s" % basefile

        # The temporary name is never used by the template below, so a
        # placeholder file created by the helper would be left behind
        # on every call; remove it.
        tmpfilename = P.getTempFilename()
        if os.path.exists(tmpfilename):
            os.unlink(tmpfilename)

        s, infile = Sra.process_remote_BAM(
            infile,
            token,
            filename,
            filter_bed=os.path.join(
                PARAMS["annotations_dir"],
                PARAMS["annotations_interface_contigs_bed"]))

        infile = " ".join(infile)
        # -p: the download directory is nested (temp_bams/<name>), so a
        # plain mkdir fails when temp_bams/ is missing or when the
        # directory is left over from an interrupted run.
        statement = "; checkpoint; ".join(
            ["mkdir -p %(filename)s", s, statement, "rm -r %(filename)s"])

    P.run()
def makeAdaptorFasta(infile, outfile, track, dbh, contaminants_file):
    '''Generate a .fasta file of adaptor sequences that are
    overrepresented in the reads from a sample.

    Requires cutadapt >= 1.7.

    NOTE(review): as visible here the function only collects the
    overrepresented sequences from the database; nothing is written to
    `outfile` and `contaminants_file` is not read.  The remainder of
    the function appears to be missing -- confirm against the original
    source.

    Arguments
    ---------
    infile : string
        Input filename that has been QC'ed. The filename is used to
        check if the input was a :term:`sra` file and guess the
        number of tracks to check.
    outfile : string
        Output filename in :term:`fasta` format.
    track : string
        Track name, used to access FastQC results in database.
    dbh : object
        Database handle.
    contaminants_file : string
        Path of file containing contaminants used for screening by
        Fastqc.

    '''
    tracks = [track]

    if infile.endswith(".sra"):
        # patch for SRA files, look at multiple tracks
        f, fastq_format = Sra.peek(infile)
        if len(f) == 2:
            tracks = [track + "_fastq_1", track + "_fastq_2"]

    found_contaminants = []
    for t in tracks:
        table = PipelineTracks.AutoSample(os.path.basename(t)).asTable()

        query = '''SELECT Possible_Source, Sequence FROM
        %s_fastqc_Overrepresented_sequences;''' % table

        cc = dbh.cursor()
        try:
            found_contaminants.extend(cc.execute(query).fetchall())
        # "except X as name" and print(...) are valid on both Python 2.6+
        # and Python 3; the comma form and the print statement are
        # syntax errors on Python 3.
        except sqlite3.OperationalError as msg:
            # empty table: report it and move on to the next track
            print(msg)
def runKalistoOnRemoteBAM(infiles, outfile):
    '''Run kallisto on a ``.bam`` or ``.remote`` file.

    The input is converted to two temporary FASTQ files with
    ``samtools fastq`` and quantified with ``kallisto quant`` into the
    directory of `outfile`.  The temporary files are then removed and
    `outfile` (minus its ``.gz`` suffix) is gzipped.  If the input ends
    in ``.remote`` the statement returned by
    ``Sra.process_remote_BAM`` is run first.

    Arguments
    ---------
    infiles : tuple
        ``(infile, kallisto_index)``.
    outfile : string
        Path of the gzip-compressed output; its directory is the
        kallisto output directory.
    '''

    # P.run() is called without arguments, so it presumably reads
    # `statement` and job_memory from this function's local variables
    # -- confirm against the Pipeline module.
    job_memory = "6G"
    infile, kallisto_index = infiles

    outdir = os.path.dirname(outfile)
    outfile = P.snip(outfile, ".gz")

    statement = []

    tempfastq1 = P.getTempFilename()
    tempfastq2 = P.getTempFilename()
    rm_files = [tempfastq1, tempfastq2]

    # This body used a mix of tabs and spaces, which Python 3 rejects
    # with a TabError; it is indented with spaces only.
    if infile.endswith(".remote"):
        tempbam = P.getTempFilename()
        s, infiles = Sra.process_remote_BAM(infile, outdir=tempbam)
        infile = " ".join(infiles)
        statement.append(s)
        rm_files.append(tempbam)

    statement.append('''samtools fastq %(infile)s 
                                      -1 %(tempfastq1)s 
                                      -2 %(tempfastq2)s''')

    statement.append('''kallisto quant -i %(kallisto_index)s
                                       -o %(outdir)s
                                       %(tempfastq1)s %(tempfastq2)s''')
    rm_files = " ".join(rm_files)
    statement.append('''rm -R %(rm_files)s''')
    statement.append('''gzip %(outfile)s''')
    statement = "; \n checkpoint;\n".join(statement)

    P.run()
def downloadSequinsNeatData(outfile):
    '''Download the neat Sequins data from NCBI.

    The ``.sra`` archive for the requested sample is fetched with
    ``wget`` into the working directory and extracted into the
    directory of `outfile`.  The extracted ``_1``/``_2`` FASTQ files
    are renamed to ``.fastq.1.gz``/``.fastq.2.gz`` and the archive is
    deleted.

    Arguments
    ---------
    outfile : string
        Output path; its basename must be ``neat-A.fastq.1.gz`` or
        ``neat-B.fastq.1.gz``.

    Raises
    ------
    KeyError
        If the basename of `outfile` is not one of the two known
        samples.
    '''

    address_base = 'ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByExp/sra/SRX/SRX189'

    outfile2srr = {
        'neat-A.fastq.1.gz': 'SRR3743147',
        'neat-B.fastq.1.gz': 'SRR3743148'
    }

    srr2srx = {'SRR3743147': 'SRX1897294', 'SRR3743148': 'SRX1897295'}

    outfile_base = os.path.basename(outfile)

    srr = outfile2srr[outfile_base]
    srx = srr2srx[srr]

    outfile_name = P.snip(outfile_base, '.fastq.1.gz')

    # Each P.run() below is called without arguments, so it presumably
    # reads the current value of `statement` from this function's local
    # variables -- confirm against the Pipeline module.
    statement = '''
    wget %(address_base)s/%(srx)s/%(srr)s/%(srr)s.sra
    -O %(outfile_name)s.sra
    '''
    P.run()

    # dirname() is empty when outfile has no directory part; the mv
    # targets below would then start with "/" and point at the
    # filesystem root.  Fall back to the current directory.
    outdir = os.path.dirname(outfile) or "."
    statement = Sra.extract(outfile_name + '.sra', outdir)
    P.run()

    statement = '''
    mv %(outdir)s/%(outfile_name)s_1.fastq.gz
    %(outdir)s/%(outfile_name)s.fastq.1.gz; checkpoint;
    mv %(outdir)s/%(outfile_name)s_2.fastq.gz
    %(outdir)s/%(outfile_name)s.fastq.2.gz'''
    P.run()

    os.unlink(outfile_name + '.sra')
Example #9
0
def extractGSE65525(infile, outfile):
    '''Extract FASTQ files from `infile`.

    Builds a statement with ``SRA.extract`` for `infile` with
    ``GSE65525/fastqs.dir`` as second argument (presumably the output
    directory -- confirm against the SRA module) and runs it.

    Arguments
    ---------
    infile : string
        Input passed on to ``SRA.extract``.
    outfile : string
        Not used in the function body.
    '''
    # P.run() is called without arguments, so it presumably reads
    # `statement` from this function's local variables -- confirm
    # against the Pipeline module.
    statement = SRA.extract(infile, "GSE65525/fastqs.dir")

    P.run()
Example #10
0
def extractGGSE53638(infile, outfile):
    '''Extract the FASTQ files from the SRA input `infile`.

    Builds a statement with ``SRA.extract`` for `infile` with
    ``GSE53638/fastqs.dir`` as second argument (presumably the output
    directory -- confirm against the SRA module) and runs it.

    Arguments
    ---------
    infile : string
        Input passed on to ``SRA.extract``.
    outfile : string
        Not used in the function body.
    '''

    # P.run() is called without arguments, so it presumably reads
    # `statement` from this function's local variables -- confirm
    # against the Pipeline module.
    statement = SRA.extract(infile, "GSE53638/fastqs.dir")

    P.run()
Example #11
0
def makeAdaptorFasta(infile, outfile, track, dbh, contaminants_file):
    '''Generate a .fasta file of adaptor sequences that are
    overrepresented in the reads from a sample.

    The FastQC "overrepresented sequences" tables of the sample are
    queried; every reported source that is also listed in
    `contaminants_file` is written to `outfile` with the sequence from
    that file.  Sources that are not known contaminants are not
    reported.  If no overrepresented sequences are found at all,
    `outfile` is touched and left empty.

    Requires cutadapt >= 1.7.

    Arguments
    ---------
    infile : string
        Input filename that has been QC'ed. The filename is used to
        check if the input was a :term:`sra` file and guess the
        number of tracks to check.
    outfile : string
        Output filename in :term:`fasta` format.
    track : string
        Track name, used to access FastQC results in database.
    dbh : object
        Database handle.
    contaminants_file : string
        Path of file containing contaminants used for screening by
        Fastqc.

    '''
    tracks = [track]

    if infile.endswith(".sra"):
        # patch for SRA files, look at multiple tracks
        f, fastq_format, datatype = Sra.peek(infile)
        if len(f) == 2:
            tracks = [track + "_fastq_1", track + "_fastq_2"]
    elif infile.endswith(".fastq.1.gz"):
        tracks = [track + "_fastq_1", track + "_fastq_2"]
    # every other input, including ".fastq.gz", keeps the single track

    found_contaminants = []

    for t in tracks:
        table = PipelineTracks.AutoSample(os.path.basename(t)).asTable()

        # if sample name starts with a number, sql table will have
        # prepended "_"
        # (raw string: "\d" in a plain literal is an invalid escape
        # sequence and triggers a warning on current Python versions)
        if re.match(r"^\d+.*", table):
            table = "_" + table

        query = '''SELECT Possible_Source, Sequence FROM
        %s_fastqc_Overrepresented_sequences;''' % table

        cc = dbh.cursor()

        # if there is no contamination table for even a single sample
        # it will prevent the whole pipeline progressing
        try:
            found_contaminants.extend(cc.execute(query).fetchall())
        except sqlite3.OperationalError:
            E.warn("No table found for {}".format(t))

    if not found_contaminants:
        P.touch(outfile)
        return

    # read contaminants from existing file: the last whitespace-separated
    # field is the sequence, everything before it is the name
    with IOTools.openFile(contaminants_file, "r") as inf:
        fields = [line.split() for line in inf
                  if not line.startswith("#") and line.strip()]
    known_contaminants = {" ".join(x[:-1]): x[-1] for x in fields}

    # output the full sequence of the contaminant if found
    # in the list of known contaminants, otherwise don't report!
    matched_contaminants = set()
    for found_source, _ in found_contaminants:
        # FastQC sources look like "<name> (<match details>)"
        possible_source = found_source.split(" (")[0]
        if possible_source in known_contaminants:
            matched_contaminants.add(possible_source)

    # sorted: set iteration order is arbitrary, which made the record
    # order of the output differ from run to run
    with IOTools.openFile(outfile, "w") as outf:
        for match in sorted(matched_contaminants):
            outf.write(">%s\n%s\n" % (match.replace(" ,", ""),
                                      known_contaminants[match]))
def makeAdaptorFasta(infile, outfile, track, dbh, contaminants_file):
    '''Generate a .fasta file of adaptor sequences that are
    overrepresented in the reads from a sample.

    The FastQC "overrepresented sequences" tables of the sample are
    queried; every reported source that is also listed in
    `contaminants_file` is written to `outfile` with the sequence from
    that file.  Sources that are not known contaminants are not
    reported.  If no overrepresented sequences are found at all,
    `outfile` is touched and left empty.

    Requires cutadapt >= 1.7.

    Arguments
    ---------
    infile : string
        Input filename that has been QC'ed. The filename is used to
        check if the input was a :term:`sra` file and guess the
        number of tracks to check.
    outfile : string
        Output filename in :term:`fasta` format.
    track : string
        Track name, used to access FastQC results in database.
    dbh : object
        Database handle.
    contaminants_file : string
        Path of file containing contaminants used for screening by
        Fastqc.

    '''
    tracks = [track]

    if infile.endswith(".sra"):
        # patch for SRA files, look at multiple tracks
        f, fastq_format, datatype = Sra.peek(infile)
        if len(f) == 2:
            tracks = [track + "_fastq_1", track + "_fastq_2"]
    elif infile.endswith(".fastq.1.gz"):
        tracks = [track + "_fastq_1", track + "_fastq_2"]
    # every other input, including ".fastq.gz", keeps the single track

    found_contaminants = []

    for t in tracks:
        table = PipelineTracks.AutoSample(os.path.basename(t)).asTable()

        # if sample name starts with a number, sql table will have
        # prepended "_"
        # (raw string: "\d" in a plain literal is an invalid escape
        # sequence and triggers a warning on current Python versions)
        if re.match(r"^\d+.*", table):
            table = "_" + table

        query = '''SELECT Possible_Source, Sequence FROM
        %s_fastqc_Overrepresented_sequences;''' % table

        cc = dbh.cursor()

        # if there is no contamination table for even a single sample
        # it will prevent the whole pipeline progressing
        try:
            found_contaminants.extend(cc.execute(query).fetchall())
        except sqlite3.OperationalError:
            E.warn("No table found for {}".format(t))

    if not found_contaminants:
        P.touch(outfile)
        return

    # read contaminants from existing file: the last whitespace-separated
    # field is the sequence, everything before it is the name
    with IOTools.openFile(contaminants_file, "r") as inf:
        fields = [line.split() for line in inf
                  if not line.startswith("#") and line.strip()]
    known_contaminants = {" ".join(x[:-1]): x[-1] for x in fields}

    # output the full sequence of the contaminant if found
    # in the list of known contaminants, otherwise don't report!
    matched_contaminants = set()
    for found_source, _ in found_contaminants:
        # FastQC sources look like "<name> (<match details>)"
        possible_source = found_source.split(" (")[0]
        if possible_source in known_contaminants:
            matched_contaminants.add(possible_source)

    # sorted: set iteration order is arbitrary, which made the record
    # order of the output differ from run to run
    with IOTools.openFile(outfile, "w") as outf:
        for match in sorted(matched_contaminants):
            outf.write(">%s\n%s\n" % (match.replace(" ,", ""),
                                      known_contaminants[match]))
def extractGSE65525(infile, outfile):
    '''Extract FASTQ files from `infile`.

    Builds a statement with ``SRA.extract`` for `infile` with
    ``GSE65525/fastqs.dir`` as second argument (presumably the output
    directory -- confirm against the SRA module) and runs it.

    Arguments
    ---------
    infile : string
        Input passed on to ``SRA.extract``.
    outfile : string
        Not used in the function body.
    '''
    # P.run() is called without arguments, so it presumably reads
    # `statement` from this function's local variables -- confirm
    # against the Pipeline module.
    statement = SRA.extract(infile, "GSE65525/fastqs.dir")

    P.run()
def extractGGSE53638(infile, outfile):
    '''Extract the FASTQ files from the SRA input `infile`.

    Builds a statement with ``SRA.extract`` for `infile` with
    ``GSE53638/fastqs.dir`` as second argument (presumably the output
    directory -- confirm against the SRA module) and runs it.

    Arguments
    ---------
    infile : string
        Input passed on to ``SRA.extract``.
    outfile : string
        Not used in the function body.
    '''

    # P.run() is called without arguments, so it presumably reads
    # `statement` from this function's local variables -- confirm
    # against the Pipeline module.
    statement = SRA.extract(infile, "GSE53638/fastqs.dir")

    P.run()