def alignment_blat_oldMaster(FASTA, reference, fileName, outDir):
    '''
    Align a set of sequences into a reference with blat

    Input:
        1. FASTA: Path to FASTA file with sequences to align
        2. reference: Path to the reference genome in fasta format
        3. fileName: output file will be named accordingly
        4. outDir: Output directory

    Output:
        1. PSL: Path to PSL file containing input sequences alignments. The path is returned
                even when blat exits with a non-zero status (the failure is only logged)
    '''
    ## Align the sequences into the reference
    PSL = outDir + '/' + fileName + '.psl'
    command = 'blat -stepSize=5 -repMatch=2253 -minScore=20 -minIdentity=0 -noHead -out=psl ' + reference + ' ' + FASTA + ' ' + PSL

    # The context manager closes the stderr log as soon as blat finishes
    with open(outDir + '/align.err', 'w') as err:
        status = subprocess.call(command, stderr=err, shell=True)

    if status != 0:
        step = 'ALIGN'
        msg = 'Alignment failed'
        log.step(step, msg)

    return PSL
def alignment_minimap2(FASTA, index, fileName, processes, outDir):
    '''
    Align a set of sequences into a reference with minimap2

    Input:
        1. FASTA: Path to FASTA file with sequences to align
        2. index: Path to the index of the reference in .mmi format (generated with minimap2)
        3. fileName: output file will be named accordingly
        4. processes: Number of processes used by minimap2
        5. outDir: Output directory

    Output:
        1. PAF: Path to PAF file containing input sequences alignments. The path is returned
                even when minimap2 exits with a non-zero status (the failure is only logged)
    '''
    ## Align the sequences into the reference
    # Note, consider to use -Y to get soft clippings for supplementary alignments
    PAF = outDir + '/' + fileName + '.paf'
    command = 'minimap2 -t ' + str(processes) + ' ' + index + ' ' + FASTA + ' > ' + PAF

    # The context manager closes the stderr log as soon as minimap2 finishes
    with open(outDir + '/align.err', 'w') as err:
        status = subprocess.call(command, stderr=err, shell=True)

    if status != 0:
        step = 'ALIGN'
        msg = 'Local alignment failed'
        log.step(step, msg)

    return PAF
def alignment_bwa(FASTA, reference, fileName, processes, outDir):
    '''
    Align a set of sequences into a reference with bwa mem

    Input:
        1. FASTA: Path to FASTA file with sequences to align
        2. reference: Path to the reference genome in fasta format (bwa mem index must be located in the same folder)
        3. fileName: output file will be named accordingly
        4. processes: Number of processes used by bwa mem
        5. outDir: Output directory

    Output:
        1. SAM: Path to SAM file containing input sequences alignments. The path is returned
                even when bwa exits with a non-zero status (the failure is only logged)
    '''
    ## Align the sequences into the reference
    SAM = outDir + '/' + fileName + '.sam'
    command = 'bwa mem -Y -t ' + str(processes) + ' ' + reference + ' ' + FASTA + ' > ' + SAM

    # The context manager closes the stderr log as soon as bwa finishes
    with open(outDir + '/align.err', 'w') as err:
        status = subprocess.call(command, stderr=err, shell=True)

    if status != 0:
        step = 'ALIGN'
        msg = 'Alignment failed'
        log.step(step, msg)

    return SAM
def komplexityFilter(komplexityThreshold, inFasta, outFasta, outDir):
    '''
    Filter fasta file using komplexity tool

    Input:
        1. komplexityThreshold: Complexity threshold filter.
        2. inFasta: input FASTA file name
        3. outFasta: output FASTA file name
        4. outDir: input AND output directory (it must be the same)

    Output:
        1. allFastas: Filtered FASTA file complete path. Returned even when the filter
                      exits with a non-zero status (the failure is only logged)
    '''
    # Set input and output files
    allFastas_all = outDir + '/' + inFasta
    allFastas = outDir + '/' + outFasta

    logDir = outDir + '/Logs'
    unix.mkdir(logDir)

    command = 'kz --filter --threshold ' + str(komplexityThreshold) + ' --fasta < ' + allFastas_all + ' > ' + allFastas

    # The context manager closes the stderr log as soon as the filter finishes
    with open(logDir + '/komplexity.err', 'w') as err:
        status = subprocess.call(command, stderr=err, shell=True)

    if status != 0:
        step = 'KOMPLEXITY'
        msg = 'Komplexity filter failed. PID: ' + str(os.getpid())
        log.step(step, msg)

    return allFastas
def BAM2BED(BAM, outDir):
    '''
    Convert BAM file into BED using bedtools

    Input:
        1. BAM: Path to BAM file
        2. outDir: Output directory

    Output:
        1. BED: Path to BED file. A header line is prepended to the bedtools output
    '''
    ## 0. Create logs directory
    logDir = outDir + '/Logs'
    unix.mkdir(logDir)

    ## 1. Convert BAM into BED
    BED_path = outDir + '/alignments.bed'
    command = 'bedtools bamtobed -split -i ' + BAM + ' > ' + BED_path

    # The context manager closes the stderr log as soon as bedtools finishes
    with open(logDir + '/BAM2BED.err', 'w') as err:
        status = subprocess.call(command, stderr=err, shell=True)

    if status != 0:
        step = 'BAM2BED'
        msg = 'BAM to BED conversion failed'
        log.step(step, msg)

    ## 2. Add header to BED file
    # The header is added even if the conversion above failed (the failure is only logged)
    header = "#ref \t beg \t end \t name \t score \t strand \n"

    with open(BED_path, 'r') as original:
        data = original.read()

    with open(BED_path, 'w') as modified:
        modified.write(header + data)

    return BED_path
def aligmentMaxNbMatches(FASTA_file, db, PAF_file, outDir):
    '''
    Align sequences with minimap2 and pick the first alignment after sorting by number of matches

    Input:
        1. FASTA_file: Path to FASTA file with sequences to align
        2. db: Path passed to minimap2 as alignment target
        3. PAF_file: Path where minimap2 output is written
        4. outDir: Directory where the minimap2 stderr log (identifyMate.err) is written

    Output:
        1. topHit: First element returned by PAF.sortNbMatches() (expected to be the
                   alignment with the highest number of matches). None if the PAF file is empty
    '''
    # TODO: re-enable (unsilence) the error output
    # TODO: append to the error file instead of overwriting it
    command = 'minimap2 ' + db + ' ' + FASTA_file + ' > ' + PAF_file

    # The context manager closes the stderr log as soon as minimap2 finishes
    with open(outDir + '/identifyMate.err', 'w') as err:
        status = subprocess.call(command, stderr=err, shell=True)

    if status != 0:
        step = 'IDENTIFY MATE SEQ'
        msg = 'Identify mate sequence failed'
        log.step(step, msg)

    # An empty PAF file means that no alignment was reported
    if os.stat(PAF_file).st_size == 0:
        return None

    PAFObj = formats.PAF()
    PAFObj.read(PAF_file)

    # Pick the alignment with the highest number of matches
    topHit = PAFObj.sortNbMatches()[0]

    return topHit
def targeted_alignment_minimap2(FASTA, targetInterval, reference, outDir, outFormat):
    # NOTE 2020: In 2020:
    # def targeted_alignment_minimap2(FASTA, targetInterval, reference, outDir):
    '''
    Align a set of sequences into a reference target region.

    Useful for doing local realignment of reads around SV breakpoints. Much faster than whole genome realignment

    Input:
        1. FASTA: Path to FASTA file with sequences to align
        2. targetInterval: Reference genome interval where sequences will be aligned. The interval must be provided as chr:beg-end.
        3. reference: Path to the reference sequences in fasta format. An index of the reference generated with samtools faidx must be located in the same directory
        4. outDir: Output directory
        5. outFormat: BAM or SAM

    Output:
        1. Path to the SAM file if outFormat is "SAM"; otherwise the value returned by bamtools.SAM2BAM.
           'None' if target extraction or alignment failed
    '''
    ## 0. Create logs directory
    logDir = outDir + '/Logs'
    unix.mkdir(logDir)

    ## 1. Extract the reference target region prior alignment
    target = outDir + '/target.fa'
    command = 'samtools faidx ' + reference + ' ' + targetInterval + ' > ' + target

    # The context manager closes the stderr log as soon as samtools finishes
    with open(logDir + '/target.err', 'w') as err:
        status = subprocess.call(command, stderr=err, shell=True)

    if status != 0:
        step = 'TARGET'
        msg = 'Extraction of reference target region failed'
        log.step(step, msg)
        return None

    ## 2. Align the sequences into the target region
    # Use -Y to get soft clippings for supplementary alignments
    SAM = outDir + '/alignments.sam'
    command = 'minimap2 -Y -a ' + target + ' ' + FASTA + ' > ' + SAM

    with open(logDir + '/align.err', 'w') as err:
        status = subprocess.call(command, stderr=err, shell=True)

    if status != 0:
        step = 'ALIGN'
        msg = 'Local alignment failed'
        log.step(step, msg)
        return None

    # NOTE 2020: In 2020 these 2 lines were deleted:
    # When SAM is requested, the target fasta and the SAM file are left in place (no cleanup)
    if outFormat == "SAM":
        return SAM

    ## 3. Convert SAM to sorted BAM
    BAM = bamtools.SAM2BAM(SAM, outDir)

    ## 4. Do cleanup
    unix.rm([target, SAM])

    return BAM
def index_bwa(fastaPath, outDir):
    '''
    Wrapper to generate BWA index for fasta file

    Input:
        1. fastaPath: Path to the fasta file to be indexed
        2. outDir: Directory where the bwa stderr log (index.err) is written
    '''
    command = 'bwa index ' + fastaPath

    # The context manager closes the stderr log as soon as bwa finishes
    with open(outDir + '/index.err', 'w') as err:
        status = subprocess.call(command, stderr=err, shell=True)

    if status != 0:
        step = 'INDEX'
        msg = 'BWA indexing failed'
        log.step(step, msg)
def run(self):
    """Run the "up" action: parse dependencies, diff them, sync and report."""
    log.print_action_header("up")

    # Read all dependencies and validate
    log.step("Parsing dependencies")
    parsed = self.__parse_dependencies()

    # Keep only the dependencies that differ from the current state
    log.step("Diffing dependencies to current")
    pending = self.__diff_to_current(parsed)

    # Sync dependencies
    # NOTE(review): nothing is ever appended to this list, so it reaches
    # __output_results empty -- confirm whether that is intended.
    outcome = []
    pending_count = len(pending)
    if pending_count > 0:
        log.step("Syncing {count} dependencies".format(count=pending_count))
        for dependency in pending:
            self.download_and_apply(dependency)

    # Output results
    log.step("Collating results")
    print("")
    self.__output_results(outcome)
def index_minimap2(fastaPath, fileName, outDir):
    '''
    Wrapper to generate minimap2 index for fasta file

    Input:
        1. fastaPath: Path to the fasta file to be indexed
        2. fileName: index file will be named accordingly (fileName + '.mmi')
        3. outDir: Output directory. The minimap2 stderr log (index.err) is written here too

    Output:
        1. indexPath: Path to the minimap2 index. Returned even when minimap2 exits with a
                      non-zero status (the failure is only logged)
    '''
    indexPath = outDir + '/' + fileName + '.mmi'
    command = 'minimap2 -k 11 -w 1 -d ' + indexPath + ' ' + fastaPath

    # The context manager closes the stderr log as soon as minimap2 finishes
    with open(outDir + '/index.err', 'w') as err:
        status = subprocess.call(command, stderr=err, shell=True)

    if status != 0:
        step = 'INDEX'
        msg = 'minimap2 indexing failed'
        log.step(step, msg)

    return indexPath
def getPAFAlign(FASTA_file, indexDb, outDir):
    '''
    Align a FASTA file with minimap2 and write the alignments in PAF format

    Input:
        1. FASTA_file: Path to FASTA file with sequences to align
        2. indexDb: Path passed to minimap2 as alignment target
        3. outDir: Directory where the minimap2 stderr log (minimap2.err) is written

    Output:
        1. PAF_file: Path to PAF file, derived from FASTA_file by replacing '.fa' with
                     '_alignments.paf'. Returned even when minimap2 exits with a non-zero
                     status (the failure is only logged)
    '''
    # Align the consensus fasta
    # TODO: do this properly
    # NOTE(review): str.replace substitutes every '.fa' occurrence in the path, not only
    # the extension -- confirm that input paths never contain '.fa' elsewhere
    PAF_file = FASTA_file.replace(".fa", "_alignments.paf")
    command = 'minimap2 ' + indexDb + ' ' + FASTA_file + ' > ' + PAF_file

    # The context manager closes the stderr log as soon as minimap2 finishes
    with open(outDir + '/minimap2.err', 'w') as err:
        status = subprocess.call(command, stderr=err, shell=True)

    if status != 0:
        step = 'ALIGN-INSERT'
        msg = 'minimap2 alignment failed'
        log.step(step, msg)

    return PAF_file
def rm(paths):
    '''
    Delete set of files/directories. Directories are deleted recursively

    Input:
        1. paths: list containing file/directory paths to be deleted

    Paths that do not exist are skipped. A failed deletion is logged, not raised
    '''
    for path in paths:

        # Only attempt to delete the path if it does exist
        if not os.path.exists(path):
            continue

        try:
            status = subprocess.call(['rm', '-r', path])

        except OSError:
            # Raised when the 'rm' executable itself cannot be launched
            status = 1

        # A non-zero exit status means that 'rm' could not delete the path
        if status != 0:
            step = 'ERROR'
            msg = "Deletion of the file %s failed" % path
            log.step(step, msg)
def mkdir(path):
    '''
    Create directory (parent directories are created as needed)

    Note: improve function to be able to create lists of directories

    Input:
        1. path: directory to be created

    Nothing is done if the directory already exists. A failed creation is logged, not raised
    '''
    # Only attempt to create the directory if it does not exist
    if os.path.isdir(path):
        return

    try:
        status = subprocess.call(['mkdir', '-p', path])

    except OSError:
        # Raised when the 'mkdir' executable itself cannot be launched
        status = 1

    # A non-zero exit status means that 'mkdir' could not create the directory
    if status != 0:
        step = 'ERROR'
        msg = "Creation of the directory %s failed" % path
        log.step(step, msg)
def alignment_blat(FASTA, reference, args, fileName, outDir):
    '''
    Align a set of sequences into a reference with blat

    Input:
        1. FASTA: Path to FASTA file with sequences to align
        2. reference: Path to the reference genome in fasta format
        3. args: dictionary containing blat arguments. Only the 'stepSize' and 'tileSize' keys are used
        4. fileName: output file will be named accordingly
        5. outDir: Output directory

    Output:
        1. PSL: Path to PSL file containing input sequences alignments. The path is returned
                even when blat exits with a non-zero status (the failure is only logged)
    '''
    ## Align the sequences into the reference
    PSL = outDir + '/' + fileName + '.psl'

    # Set blat arguments (only those provided by the caller)
    blatArgs = []

    for option in ('stepSize', 'tileSize'):
        if option in args:
            blatArgs.append('-' + option + '=' + str(args[option]))

    command = 'blat ' + ' '.join(blatArgs) + ' -repMatch=2253 -minScore=20 -minIdentity=0 -noHead -out=psl ' + reference + ' ' + FASTA + ' ' + PSL

    # The context manager closes the stderr log as soon as blat finishes
    with open(outDir + '/align.err', 'w') as err:
        status = subprocess.call(command, stderr=err, shell=True)

    if status != 0:
        step = 'ALIGN'
        msg = 'Alignment failed'
        log.step(step, msg)

    return PSL
def create_targeted_fasta(targetIntervalList, reference, outDir):
    '''
    Extract regions of interest from a fasta file.

    Input:
        1. targetIntervalList: Reference genome list of intervals to be extracted. The intervals must be provided as chr:beg-end.
        2. reference: Path to fasta file. An index of the reference generated with samtools faidx must be located in the same directory
        3. outDir: Output directory

    Output:
        1. target: Path to fasta file with sequences extracted from intervals or 'None' if extraction failed
    '''
    ## 0. Create logs directory
    logDir = outDir + '/Logs'
    unix.mkdir(logDir)

    ## 1. Extract the reference target regions
    target = outDir + '/targetRegions.fa'

    # Write one interval per line, as this file is handed to samtools faidx through -r
    targetRegionsPath = outDir + '/targetRegions.txt'

    with open(targetRegionsPath, 'w') as targetRegions:
        for targetInterval in targetIntervalList:
            targetRegions.write(targetInterval + '\n')

    command = 'samtools faidx ' + reference + ' -r ' + targetRegionsPath + ' -o ' + target

    # The context manager closes the stderr log as soon as samtools finishes
    with open(logDir + '/target.err', 'w') as err:
        status = subprocess.call(command, stderr=err, shell=True)

    if status != 0:
        step = 'TARGET'
        msg = 'Extraction of reference target region failed'
        log.step(step, msg)
        return None

    # TODO: remove targetRegionsPath file
    return target
def SAM2BAM(SAM, outDir):
    '''
    Convert SAM file into sorted BAM and make BAM index

    Input:
        1. SAM: File containing alignments in SAM format
        2. outDir: Output directory

    Output:
        1. BAM_sorted: Sorted and indexed BAM file. BAM index located in the same directory with the extension '.bai'.
                       The path is returned even if one of the samtools steps fails (failures are only logged)
    '''
    ## 0. Create logs directory
    logDir = outDir + '/Logs'
    unix.mkdir(logDir)

    ## 1. Convert SAM into BAM
    BAM = outDir + '/alignments.bam'
    command = 'samtools view -Sb ' + SAM + ' > ' + BAM

    # Each stderr log is opened through a context manager so that it is closed after its step
    with open(logDir + '/SAM2BAM.err', 'w') as err:
        status = subprocess.call(command, stderr=err, shell=True)

    if status != 0:
        step = 'SAM2BAM'
        msg = 'SAM to BAM conversion failed'
        log.step(step, msg)

    ## 2. Sort bam
    BAM_sorted = outDir + '/alignments.sorted.bam'
    command = 'samtools sort ' + BAM + ' > ' + BAM_sorted

    with open(logDir + '/sort.err', 'w') as err:
        status = subprocess.call(command, stderr=err, shell=True)

    if status != 0:
        step = 'SORT'
        msg = 'BAM sorting failed'
        log.step(step, msg)

    ## 3. Index bam
    # NOTE(review): stdout is redirected to the '.bai' path next to the sorted BAM --
    # confirm that this redirect is intentional
    BAM_index = outDir + '/alignments.sorted.bam.bai'
    command = 'samtools index ' + BAM_sorted + ' > ' + BAM_index

    with open(logDir + '/index.err', 'w') as err:
        status = subprocess.call(command, stderr=err, shell=True)

    if status != 0:
        step = 'INDEX'
        msg = 'BAM indexing failed'
        log.step(step, msg)

    return BAM_sorted
if ssh_key is None: ssh_key = digitalocean.SSHKey(token=token) ssh_key.name = key_name ssh_key.public_key = public_key ssh_key.create() droplet_name = env('AZK_MID', 'azk-deploy') droplet_name = env('BOX_NAME', droplet_name) droplet_region = env('BOX_REGION', 'nyc3') droplet_image = env('BOX_IMAGE', 'ubuntu-14-04-x64') droplet_size = env('BOX_SIZE', '1gb') droplet_backup = True if env('BOX_BACKUP', 'false') == 'true' else None droplet_private_networking = True if env('BOX_PRIVATE_NETWORKING', 'false') == 'true' else None log.step('Logging into DigitalOcean') manager = digitalocean.Manager(token=token) log.step_done() log.step('Getting existing droplets') droplets = manager.get_all_droplets() log.step_done() droplet = None for a_droplet in droplets: if a_droplet.name == droplet_name: droplet = a_droplet break if not droplet is None and ( droplet.region['slug'] != droplet_region or
ssh_key = digitalocean.SSHKey(token=token) ssh_key.name = key_name ssh_key.public_key = public_key ssh_key.create() droplet_name = env('AZK_MID', 'azk-deploy') droplet_name = env('BOX_NAME', droplet_name) droplet_region = env('BOX_REGION', 'nyc3') droplet_image = env('BOX_IMAGE', 'ubuntu-14-04-x64') droplet_size = env('BOX_SIZE', '1gb') droplet_backup = True if env('BOX_BACKUP', 'false') == 'true' else None droplet_private_networking = True if env('BOX_PRIVATE_NETWORKING', 'false') == 'true' else None log.step('Logging into DigitalOcean') manager = digitalocean.Manager(token=token) log.step_done() log.step('Getting existing droplets') droplets = manager.get_all_droplets() log.step_done() droplet = None for a_droplet in droplets: if a_droplet.name == droplet_name: droplet = a_droplet break if not droplet is None and ( droplet.region['slug'] != droplet_region