예제 #1
0
def prepare_vari(workdir, snpEff, snpSift, email, genome, genes):
    """
    Prepare variant input files for provean from the vcf file in one folder.

    Each folder should only have vcf and vcf.idx file. The first
    ``*.filter.vcf`` found is annotated with snpEff unless the matching
    ``.eff.vcf`` already exists. The annotated file is then filtered with
    snpSift once per entry of ``genes`` and every filtered vcf is passed to
    ``vari_input4provean``.

    * workdir: the folder that has vcf files
    * snpEff: path to snpEff
    * snpSift: path to snpSift
    * email: email or phone number handed to ``Message`` on failure
    * genome: genome name defined in snpEff
    * genes: A list of gene symbols; for the empty string ``''`` only the
      impact filter is applied (no gene restriction)

    Returns None. Raises IndexError when workdir holds no ``*.filter.vcf``
    file, and re-raises whatever ``vari_input4provean`` raises after
    reporting it. A gene whose snpSift filtering fails is reported and
    skipped, so an undefined or stale filtered vcf is never processed.
    """
    # Single-argument print(...) is used throughout: it produces the same
    # text whether it is parsed as a Python 2 statement or a Python 3 call.

    # Only HIGH and MODERATE impact annotations are kept for provean.
    impact_if = '((ANN[*].IMPACT=\'HIGH\') | (ANN[*].IMPACT=\'MODERATE\'))'

    vcfFiles = glob.glob(workdir + '/*.filter.vcf')
    if not vcfFiles:
        # Same exception type the plain vcfFiles[0] lookup would produce,
        # but with a message that names the folder.
        raise IndexError(
            'no *.filter.vcf file found in {dir}'.format(dir=workdir))
    vcfFile = vcfFiles[0]
    #============= 1. Annotate vcf results using snpEff ================
    annotatedVCF = vcfFile[:-3] + 'eff.vcf'
    if not os.path.exists(annotatedVCF):
        annotatedVCF = snpEff_annotateVCF(
            vcfFile, snpEff, genome)  # annotated: filename.eff.vcf
    #============= 2. Loop for every gene ================================
    for gene in genes:
        print('{gene} start to get input files for provean'.format(gene=gene))
        #============= (1). Filter the annotated file ========================
        if gene == '':
            filters = [impact_if]
        else:
            gene_if = '(ANN[*].GENE=\'{gene}\')'.format(gene=gene)
            # The '&' stays glued to the impact expression as ONE list
            # element, which is what the adjacent string literals produced.
            filters = [gene_if, '&' + impact_if]
        try:
            filteredVCF = snpSift_filterVCF(annotatedVCF, snpSift, filters)
        except Exception:
            print('{gene} snpSift filter failed'.format(gene=gene))
            Message('snpSift filter failed', email)
            # Without a fresh filtered vcf there is nothing valid to process
            # for this gene; carrying on would reuse the previous gene's file.
            continue
        if gene != '':
            print('filteredVCF is:  {vcf}'.format(vcf=filteredVCF))
        #============= (2). Get input files for provean ======================
        try:
            vari_files = vari_input4provean(filteredVCF)
        except Exception:
            print('{gene} fail to get provean inputs'.format(gene=gene))
            Message('fail to get provean inputs', email)
            raise
        if vari_files == '':
            print('{gene} does not have any interested variants'.format(
                gene=gene))
    print('{dir} provean input succeed'.format(dir=workdir))
예제 #2
0
# Input folder and kallisto index location, read from the `param` mapping.
file_path = param['filePath']
kallisto_index = param['kallisto_index']

# Trimming options. `trim` is compared with the string 'True' below, so it is
# a string flag rather than a boolean.
trim = param['trim']
phred = param['phred']
trimmomatic = param['trimmomatic']
trimmoAdapter = param['trimmoAdapter']


#===============================================================================
#                 Pipeline
#===============================================================================
#=========== (0) enter the directory ================

# The start notification is sent before changing into the data folder.
Message(startMessage,email)

os.chdir(file_path)
#=========== (1) reads files and trim ====================
fastqFiles = list_files(file_path)
print 'list file succeeded'
if trim == 'True':
    try:
        # The value returned by Trimmomatic replaces the fastq list.
        fastqFiles = Trimmomatic(trimmomatic,fastqFiles,phred,trimmoAdapter,batch=6)
        print 'trim succeed'
        print 'fastqFiles is: ',fastqFiles
    except:
        # Report the failure on stdout and through Message, then re-raise so
        # the script does not continue with untrimmed input.
        print 'trim failed'
        Message('trim failed',email)
        raise
#=========== (2) run sailfish mapping ====================
# NOTE(review): the heading says sailfish but the parameter read above is
# `kallisto_index` -- confirm which mapper the following code runs.
예제 #3
0
# Reference, input folder, aligner index and tool locations from `param`.
ref_fa = param['refSequence']
file_path = param['filePath']
bwaIndex = param['alignerDb']
trim = param['trim']
phred = param['phred']
picard = param['picard']
trimmomatic = param['trimmomatic']
trimmoAdapter = param['trimmoAdapter']

gatk = param['gatk']
read_group = param['readGroup']
organism = param['organism']
##*****************  Part 0. Build index file for bwa and GATK ******
##=================  Part I. Preprocess  ============================
#========  1. map and dedupping =====================================
Message(startMessage, email)
#========  (0) enter the directory ========================
# Folder part of the index path. os.path.dirname is used instead of slicing
# at rfind('/'): when the path has no '/', rfind returns -1 and the slice
# would silently drop the last character of the index name. An empty
# dirname (bare file name) falls back to the current directory.
bwa_path = os.path.dirname(bwaIndex) or '.'
if not os.path.exists(bwa_path):
    # makedirs also creates missing parent folders, which mkdir cannot.
    os.makedirs(bwa_path)
# An empty index folder is taken to mean the bwa index is not built yet.
if not os.listdir(bwa_path):
    bwa_Db(bwa_path, ref_fa)
os.chdir(file_path)
#========  (1) read files  ================================
fastqFiles = list_files(file_path)
if trim == 'True':
    # Trimmed files get their own name; the untrimmed inputs are then handed
    # to remove().
    trim_fastqFiles = Trimmomatic(trimmomatic,
                                  fastqFiles,
                                  phred,
                                  trimmoAdapter,
                                  batch=6)
    remove(fastqFiles)
예제 #4
0
# Adapter setting for Trimmomatic, read from the `param` mapping.
trimmoAdapter = param['trimmoAdapter']
#-------------  specific parameters for gsnap  ------------
# Separate gsnap database name / annotation settings for host and virus.
host_gsnapDbName = param['host_gsnapDbName']
host_gsnapAnnotation = param['host_gsnapAnnotation']
virus_gsnapDbName = param['virus_gsnapDbName']
virus_gsnapAnnotation = param['virus_gsnapAnnotation']
#------------- blast parameters ---------------------------
blast_Db = param['blast_Db']
runBlastAfterAssemble = param['runBlastAfterAssemble']
blast_nt_DB = param['blast_nt_DB']
#===============================================================================
#          1. map to host reference genome
#===============================================================================
#========  (0) enter the directory ========================
# Change into the data folder first, then send the start notification.
os.chdir(file_path)
Message(startMessage,email)
"""
#========  (1) read files  ================================
fastqFiles = list_files(file_path)
if trim == 'True':
    fastqFiles = Trimmomatic(trimmomatic,fastqFiles,phred)
print 'list file succeed'
print 'fastqFiles is: ',fastqFiles
#========  (2) align fastq files to host ================================
try:
    if aligner == 'gsnap':
        # check index
        if os.listdir(host_alignerDb) == []:
            gsnap_Db(host_fa,host_alignerDb,host_gsnapDbName,host_gsnapAnnotation)
        map_files = gsnap(fastqFiles,host_alignerDb, host_gsnapDbName,host_gsnapAnnotation,thread) # [file.sam]
    else:
endMessage = param['endMessage']
# database reference
fastaFile = param['reference']
# Index of the reference fasta, built once via SeqIO.index.
record_dict = SeqIO.index(fastaFile, 'fasta')
gffFile = param['annotation']
genome = param['genome']
CodonFile = param['CodonFile']
# software parameters
snpSift = param['snpSift']
snpEff = param['snpEff']
provean = param['provean']
support_set_path = param['support_set']
# other parameters
gene_file = param['gene_file']

Message(startMessage, email)

#===============================================================================
#        Variant analysis pipeline
#===============================================================================
## read gene list
if gene_file == '':
    # No gene file configured: a single empty gene symbol is used instead.
    genes = ['']
else:
    genes = set()
    # `with` closes the file even when reading fails part-way.
    with open(gene_file, 'r') as geneFile:
        for line in geneFile:
            # Only lines starting with a letter are taken as gene symbols.
            if line[0].isalpha():
                # Strip just the line terminator. Slicing off the last
                # character would truncate the final gene when the file has
                # no trailing newline and would keep '\r' on CRLF files.
                genes.add(line.rstrip('\r\n'))
    # De-duplicated gene symbols; the resulting order is not defined.
    genes = list(genes)
예제 #6
0
#-------------  input folder, STAR index and trimming switches  -------------
file_path, starDb = param['filePath'], param['alignerDb']
trim, phred = param['trim'], param['phred']

#-------------  external tools and read-group settings  ---------------------
picard, trimmomatic = param['picard'], param['trimmomatic']
trimmoAdapter, gatk = param['trimmoAdapter'], param['gatk']
read_group, organism = param['readGroup'], param['organism']

##*****************  Part 0. Build index file for bwa and GATK ******
##*****************  Part I. Preprocess  ============================
#========  1. map and dedupping =====================================
#========  (0) enter the directory ========================
os.chdir(file_path)
Message(startMessage, email)
#========  (1) read files  ================================
fastqFiles = list_files(file_path)
if trim != 'True':
    # No trimming requested: downstream steps work on the listed files as-is.
    trim_fastqFiles = fastqFiles
else:
    # Trim first, then hand the untrimmed inputs to remove().
    trim_fastqFiles = Trimmomatic(
        trimmomatic, fastqFiles, phred, trimmoAdapter, batch=6)
    remove(fastqFiles)
sys.stdout.write('list file succeed\n')
sys.stdout.write('fastqFiles is: {fq}\n'.format(fq=trim_fastqFiles))
예제 #7
0
# Known-variant resource settings read from the `param` mapping.
gold_snp = param['dbSNP']
phaseINDEL = param['phase1INDEL']
gold_indel = param['MillINDEL']
omni = param['omni']
hapmap = param['hapMap']

gatk = param['gatk']
read_group = param['readGroup']
organism = param['organism']

##*****************  Part 0. Build index file for bwa and GATK ******
##*****************  Part I. Preprocess  ============================
#========  1. map and dedupping =====================================
#========  (0) enter the directory ========================
os.chdir(file_path)
Message(startMessage, email)
#========  (1) read files  ================================
fastqFiles = list_files_human(file_path)
if trim == 'True':
    # The value returned by Trimmomatic replaces the fastq list.
    fastqFiles = Trimmomatic(trimmomatic, fastqFiles, phred, trimmoAdapter)
sys.stdout.write('list file succeed\n')
sys.stdout.write('fastqFiles is: {fq}\n'.format(fq=fastqFiles))

#========  (2) align using 2 pass STAR ====================
try:
    map_sams = STAR2Pass(fastqFiles, starDb, ref_fa, thread)
    sys.stdout.write('align succeed\n')
    sys.stdout.write('map_sams is: {map}\n'.format(map=map_sams))
except:
    # NOTE(review): the visible handler reports the failure but does not
    # re-raise; if nothing follows here, `map_sams` stays undefined for later
    # steps -- confirm against the full script.
    sys.stdout.write('align failed\n')
    Message('align failed', email)
예제 #8
0
# Two aligner database settings: one for rRNA, one for the main alignment.
alignerrRNADb = param['alignerrRNADb']
alignerDb = param['alignerDb']

trim = param['trim']
phred = param['phred']
trimmomatic = param['trimmomatic']
trimmoAdapter = param['trimmoAdapter']

annotation = param['annotation']

##*****************  Part 0. Build index file for bowtie ******
##=================  Part I. Preprocess  ============================
#========  1. map reads ==================================
#========  (0) enter the directory =======================
os.chdir(file_path)
Message(startMessage, email)
#========  (1) read and trim files  ======================
fastqFiles = list_files(file_path)
if trim == 'True':
    # NOTE(review): `trimmoAdapter` is read above but not passed to
    # Trimmomatic here -- confirm that is intended.
    fastqFiles = Trimmomatic(trimmomatic, fastqFiles, phred,
                             batch=6)  # [[filename.fq.gz]]
print 'list file succeed'
print 'fastqFiles is: ', fastqFiles
#========  (2) align to rRNA =============================
try:
    noRNA_fqs = bowtie2(fastqFiles,
                        alignerrRNADb,
                        thread,
                        otherParameters=['--un-gz'
                                         ])  # [[filename.norna.fq.gz]]
    print 'extract norRNA succeed'
예제 #9
0
#=========== trimming options ========================
trim, phred = param['trim'], param['phred']
trimmomatic, trimmoAdapter = param['trimmomatic'], param['trimmoAdapter']

#=========== aligner, annotation and output settings =
aligner, annotation = param['aligner'], param['annotation']
output_path = param['htseqOutPath']
db_name, gsnap_annotation = param['gsnapDbName'], param['gsnapAnnotation']

Dict = param['symbolIDFile']
inputpath = file_path

#=========== (0) enter the directory ================
Message(startMessage, email)
os.chdir(file_path)
#=========== (1) reads files and trim ===============
fastqFiles = list_files(file_path)
if trim == 'True':
    fastqFiles = Trimmomatic(trimmomatic, fastqFiles, phred, trimmoAdapter)
print('list file succeed')
#=========== (2) run gsnap to do the mapping ========
# gsnap when configured, STAR for any other aligner value.
map_files = (gsnap(fastqFiles, db_path, db_name, gsnap_annotation, thread)
             if aligner == 'gsnap' else STAR(fastqFiles, db_path, thread))
print('align succeed')
#=========== (3) samtools to sort the file ==========
sorted_bam = sam2bam_sort(map_files, thread)
print('sorted succeed')
예제 #10
0
organism = param['organism']
#-------------  specific parameters for gsnap  ------------
# Separate gsnap database name / annotation settings for host and virus.
host_gsnapDbName = param['host_gsnapDbName']
host_gsnapAnnotation = param['host_gsnapAnnotation']
virus_gsnapDbName = param['virus_gsnapDbName']
virus_gsnapAnnotation = param['virus_gsnapAnnotation']
#-------------  htseqCount parameters  --------------------
host_htseqFolder = param['host_htseqOutputFolder']
virus_htseqFolder = param['virus_htseqOutputFolder']

#===============================================================================
#          1. map to host reference genome
#===============================================================================
#========  (0) enter the directory ========================
os.chdir(file_path)
Message(startMessage,email)
#========  (1) read files  ================================
fastqFiles = list_files(file_path)
if trim == 'True':
    # NOTE(review): Trimmomatic is called here without an adapter argument
    # -- confirm that is intended.
    fastqFiles = Trimmomatic(trimmomatic,fastqFiles,phred)     # [[fq.gz]]
print 'list file succeed'
print 'fastqFiles is: ',fastqFiles
#========  (2) align fastq files to host ================================
try:
    if aligner == 'gsnap':
        map_files = gsnap(fastqFiles,host_alignerDb, host_gsnapDbName,host_gsnapAnnotation,thread)
    else:
        map_files = STAR(fastqFiles,host_alignerDb,thread)
    print 'host align succeed'
    print 'map_files is: ',map_files                          # [file.sam]
except:
예제 #11
0
def prepare_fa_vari(workdir, snpEff, snpSift, email, genome, genes,
                    record_dict, gffFile):
    """
    Prepare protein and variant input files for provean from one folder.

    Each folder should only have vcf and vcf.idx file. The function changes
    the process working directory to ``workdir``, annotates the first
    ``*filter.vcf`` found there with snpEff (unless the matching ``.eff.vcf``
    already exists), filters the annotated file with snpSift once per entry
    of ``genes`` and passes every filtered vcf to ``vcf2input4provean``.

    * workdir: the folder that has vcf files
    * snpEff: path to snpEff
    * snpSift: path to snpSift
    * email: email or phone number handed to ``Message`` on failure
    * genome: genome name defined in snpEff
    * genes: A list of gene symbols; for the empty string ``''`` only the
      impact filter is applied (no gene restriction)
    * record_dict: passed unchanged to ``vcf2input4provean``
      (NOTE(review): presumably the indexed reference fasta -- confirm)
    * gffFile: passed unchanged to ``vcf2input4provean``
      (NOTE(review): presumably the genome annotation file -- confirm)

    Returns None. Raises IndexError when workdir holds no ``*filter.vcf``
    file, re-raises whatever ``vcf2input4provean`` raises after reporting
    it, and raises RuntimeError when a gene yields no protein files. A gene
    whose snpSift filtering fails is reported and skipped, so an undefined
    or stale filtered vcf is never processed.
    """
    # Single-argument print(...) is used throughout: it produces the same
    # text whether it is parsed as a Python 2 statement or a Python 3 call.

    # Only HIGH and MODERATE impact annotations are kept for provean.
    impact_if = '((ANN[*].IMPACT=\'HIGH\') | (ANN[*].IMPACT=\'MODERATE\'))'

    # Prefixes of the existing '*protein.fa' files: the slice drops the
    # 10-character 'protein.fa' suffix plus the one character in front of it.
    gene_rna_lst = [
        f[:-11] for f in os.listdir(workdir) if f.endswith('protein.fa')
    ]

    os.chdir(workdir)  # set work directory
    vcfFiles = [f for f in os.listdir(workdir) if f.endswith('filter.vcf')]
    if not vcfFiles:
        # Same exception type the plain vcfFiles[0] lookup would produce,
        # but with a message that names the folder.
        raise IndexError(
            'no *filter.vcf file found in {dir}'.format(dir=workdir))
    vcfFile = vcfFiles[0]
    #============= 1. Annotate vcf results using snpEff ================
    annotatedVCF = vcfFile[:-3] + 'eff.vcf'
    if not os.path.exists(workdir + '/' + annotatedVCF):
        annotatedVCF = snpEff_annotateVCF(
            vcfFile, snpEff, genome)  # annotated: filename.eff.vcf
    #============= 2. Loop for every genes ================================
    for gene in genes:
        print('{gene} start to get input files for provean'.format(gene=gene))
        #============= (1). Filter the annotated file ========================
        if gene == '':
            filters = [impact_if]
        else:
            gene_if = '(ANN[*].GENE=\'{gene}\')'.format(gene=gene)
            # The '&' stays glued to the impact expression as ONE list
            # element, which is what the adjacent string literals produced.
            filters = [gene_if, '&' + impact_if]
        try:
            filteredVCF = snpSift_filterVCF(annotatedVCF, snpSift, filters)
        except Exception:
            print('{gene} snpSift filter failed'.format(gene=gene))
            Message('snpSift filter failed', email)
            # Without a fresh filtered vcf there is nothing valid to process
            # for this gene; carrying on would reuse the previous gene's file.
            continue
        if gene != '':
            print('filteredVCF is:  {vcf}'.format(vcf=filteredVCF))
        #============= (2). Get input files for provean ======================
        try:
            # The second element (variant files) is not needed afterwards.
            protein_files, _ = vcf2input4provean(filteredVCF, record_dict,
                                                 gffFile, gene_rna_lst)
        except Exception:
            print('{gene} fail to get provean inputs'.format(gene=gene))
            Message('fail to get provean inputs', email)
            raise
        if protein_files == '':
            # Still aborts the run, but with an explicit exception: a bare
            # `raise` here has no active exception to re-raise.
            no_variant_msg = '{gene} does not have interested variants'.format(
                gene=gene)
            print(no_variant_msg)
            raise RuntimeError(no_variant_msg)
        print('{gene} prepare for provean input finish'.format(gene=gene))
    print('{dir} provean input succeed'.format(dir=workdir))
예제 #12
0
                                                gffFile, gene_rna_lst)
        except:
            print gene, 'fail to get provean inputs'
            Message('fail to get provean inputs', email)
            raise
        if protein_files != '':
            proteinFiles.extend(protein_files)
            variantFiles.extend(variant_files)
            print gene, 'prepare for provean input finish'
        else:
            print gene, 'does not have interested variants'
            raise
    print workdir, 'provean input succeed'


# Send the start notification, then load the gene list from `gene_file`.
Message(startMessage, email)
genes = get_genes_from_file(gene_file)
#================= 0. list directories =========================================
os.chdir(pathway)  # set work directory
folders = get_all_folders(pathway)
# Reorder the folder names with natsorted before batching.
folders = natsorted(folders)
#============= 2. prepare input files for provean ======================================
# `thread` is converted with int() before being passed to chunk together
# with the folder list.
batch_folders = chunk(folders, int(thread))
for batch in batch_folders:
    proc = [
        Process(target=prepare_fa_vari,
                args=(
                    pathway + '/' + f,
                    snpEff,
                    snpSift,
                    email,
예제 #13
0
#-------------  software parameters  ----------------------
# Input folder, aligner choice and aligner database from the `param` mapping.
file_path = param['filePath']
aligner = param['aligner']
alignerDb = param['alignerDb']
trim = param['trim']
phred = param['phred']

picard = param['picard']
trimmomatic = param['trimmomatic']
trimmoAdapter = param['trimmoAdapter']
#-------------  specific parameters for gsnap  ------------
gsnapDbName = param['gsnapDbName']
gsnapAnnotation = param['gsnapAnnotation']
#========  (0) enter the directory ================
os.chdir(file_path)
Message(startMessage, email)
#========  (1) read files  ================================
fastqFiles = list_files(file_path)
if trim == 'True':
    # NOTE(review): `trimmoAdapter` is read above but not passed to
    # Trimmomatic here -- confirm that is intended.
    fastqFiles = Trimmomatic(trimmomatic, fastqFiles, phred)
print 'list file succeed'
print 'fastqFiles is: ', fastqFiles
#========  (2) align fastq files to host ================================
try:
    if aligner == 'gsnap':
        # check index
        if os.listdir(alignerDb) == []:
            gsnap_Db(ref_fa, alignerDb, gsnapDbName, gsnapAnnotation)
        map_files = gsnap(fastqFiles, alignerDb, gsnapDbName, gsnapAnnotation,
                          thread)  # [file.sam]
    else:
예제 #14
0
# Trimmomatic settings read from the `param` mapping.
trimmomatic = param['trimmomatic']
trimmoAdapter = param['trimmoAdapter']

# Aligner choice, annotation and htseq output settings.
aligner = param['aligner']
annotation = param['annotation']
output_path = param['htseqOutPath']
htseqBatch = param['htseqBatch']
db_name = param['gsnapDbName']
gsnap_annotation = param['gsnapAnnotation']

Dict = param['symbolIDFile']
# The data folder is also kept under the name `inputpath`.
inputpath = file_path

#=========== (0) enter the directory ================
os.chdir(file_path)
Message(startMessage,email)
#=========== (1) reads files and trim ===============

fastqFiles = list_files(file_path)
print 'list file succeed'
if trim == 'True':
    try:
        trim_fastqFiles = Trimmomatic(trimmomatic,fastqFiles,phred,trimmoAdapter,batch=6)
        print 'trim succeed'
        print 'fastqFiles is: ',fastqFiles
        remove(fastqFiles)
    except:
        print 'trim failed'
        Message('trim failed',email)
        raise
else: