# Script entry point: parse pipeline CLI options and pre-create the alignment
# output directory tree (<alignmentParDir>/<dataSet>/orig/{denovo, still_unaligned,
# genome, junction, ribo, reg, ids/...}) plus the task-file directory, using the
# project helper utils_os.createDirectory (imported elsewhere in this file).
# NOTE(review): this chunk is truncated — the `try:` opened here has no visible
# except/finally in this view; the handler presumably follows past the chunk end.
# NOTE(review): directory names ("orig", "denovo", ...) must match what the rest
# of the pipeline expects — confirm against the alignment scripts before renaming.
if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-r', '--readDir', help='directory containing fastq files', required=True) parser.add_argument('-a', '--alignmentParDir', help='parent directory where directory for alignments for this data set will be created', default="/srv/gsfs0/projects/salzman/Linda/alignments") parser.add_argument('-t', '--taskDir', help='name of directory under alignmentParDir to output task file', default="taskIdFiles") parser.add_argument('-d', '--dataSet', required=True, help='name of directory under alignmentParDir that all alignment files will be written to') parser.add_argument('-v', '--verbose', help='print extra debugging info', action='store_true') parser.add_argument('-u', '--unalignedMode', help='pass this flag if we are aligning the unaligned reads from previous pipeline run', action='store_true') args = parser.parse_args() try: ### create output directories if they don't exist, and the alignment subdirectories utils_os.createDirectory("/".join([args.alignmentParDir, args.taskDir])) utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet])) utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet, "orig"])) utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet, "orig", "denovo"])) utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet, "orig", "still_unaligned"])) utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet, "orig", "genome"])) utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet, "orig", "junction"])) utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet, "orig", "ribo"])) utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet, "orig", "reg"])) utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet, "orig", "ids"])) utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet, "orig", "ids", "denovo"])) 
# Continuation of the per-aligner id subdirectories (genome/junction/ribo/reg).
utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet, "orig", "ids", "genome"])) utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet, "orig", "ids", "junction"])) utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet, "orig", "ids", "ribo"])) utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet, "orig", "ids", "reg"]))
# NOTE(review): truncated fragment — begins mid add_argument() call and ends at
# `if args.sampleId.endswith("1"):` with no body visible. Python 2 print
# statements. Dumps the parsed CLI options when --verbose, then pre-creates the
# report output tree (<parentDir>/<outDirName>/{reports, glmReports, glmModels, ids})
# via the project helper utils_os.createDirectory. The trailing guard presumably
# restricts processing to read-1 samples (sampleId ending in "1") — confirm
# against the full script.
action='store_true') args = parser.parse_args() if args.verbose: print "parentDir:", args.parentDir print "sampleId:", args.sampleId print "outDirName:", args.outDirName print "style:", args.fastqIdStyle print "a1:", args.aScore1 print "a2:", args.aScore2 print "overhang:", args.overhang print "id dir suffix:", args.junctionIdDirSuffix print "unaligned:", args.unalignedMode # make output dirs if they don't exist utils_os.createDirectory("/".join([args.parentDir, args.outDirName])) utils_os.createDirectory("/".join( [args.parentDir, args.outDirName, "reports"])) # these are the reports using the naive method utils_os.createDirectory("/".join([ args.parentDir, args.outDirName, "glmReports" ])) # GLM will be run later and those reports will be stored here utils_os.createDirectory("/".join([ args.parentDir, args.outDirName, "glmModels" ])) # GLM will be run later and those models will be stored here utils_os.createDirectory("/".join([ args.parentDir, args.outDirName, "ids" ])) # txt files of read ids assigned to circular or linear category # just doing read1s if args.sampleId.endswith("1"):
# NOTE(review): truncated fragment — begins mid add_argument() call (the
# UserBPdistance option, per the help text). Normalizes origDir/outputDir to
# trailing-slash form, creates <outputDir>/DistantPEFiles/, chdirs into the
# input path, and globs the sorted genome/reg SAM files matching args.stem.
# Python 2 print statements. Near-duplicate of the L7 fragment below — if one is
# changed the other should follow.
# NOTE(review): args.origDir[-1] raises IndexError on an empty string — assumes
# non-empty paths.
required=True, help="looking for PE > X base pairs apart. Linda's default window is 100K." ) args = parser.parse_args() if args.origDir[-1] != "/": inpath = args.origDir + "/" else: inpath = args.origDir if args.outputDir[-1] != "/": outpath = args.outputDir + "/DistantPEFiles/" else: outpath = args.outputDir + "DistantPEFiles/" utils_os.createDirectory(outpath) UserBPdistance = int(args.UserBPdistance) # change the input path to the path where your file exists os.chdir(inpath) genomefiles = sorted(glob.glob(inpath + "genome/sorted*" + args.stem + "*.sam")) regfiles = sorted(glob.glob(inpath + "reg/sorted*" + args.stem + "*.sam")) print genomefiles print regfiles # print SamFiles #opening paired files
# Parse CLI options for the annotation-pickling step: read chromosome sequences
# from a fasta file, prepare output directories for exon/gene/seq-record pickles,
# and build the GFF limit dict used to restrict parsing to exon features on the
# chromosomes we actually have sequence for.
# Relies on the project helper utils_os and Biopython SeqIO, imported elsewhere
# in this file.
parser = argparse.ArgumentParser()
parser.add_argument('-f', '--fastaFile', required=True, help='path to fasta file with chromosome sequences')
parser.add_argument('-a', '--annotationFile', required=True, help='path to gff file with exon annotations')
parser.add_argument('-o', '--outDir', help='directory to output files, will be created if it does not exist', default='output')
parser.add_argument('-e', '--exonOutDir', help='directory to output exon pickle files, will be created within outDir if it does not exist', default='exons')
parser.add_argument('-g', '--geneOutDir', help='directory to output gene pickle files, will be created within outDir if it does not exist', default='genes')
parser.add_argument('-r', '--recOutDir', help='directory to output seq record pickle files, will be created within outDir if it does not exist', default='records')
parser.add_argument('-v', '--verbose', help='print info about data obtained', action='store_true')
args = parser.parse_args()

# full output paths: each pickle subdirectory lives under outDir
exonOutFullPath = '/'.join([args.outDir, args.exonOutDir])
geneOutFullPath = '/'.join([args.outDir, args.geneOutDir])
recOutFullPath = '/'.join([args.outDir, args.recOutDir])

# create output directories if necessary
utils_os.createDirectory(args.outDir)
utils_os.createDirectory(exonOutFullPath)
utils_os.createDirectory(geneOutFullPath)
utils_os.createDirectory(recOutFullPath)

# read in the sequences, keyed by record id (chromosome name).
# FIX: use a context manager so the handle is closed even if SeqIO.parse or
# SeqIO.to_dict raises (the original open()/close() pair leaked it on error).
with open(args.fastaFile, "rU") as f_handle:
    f_dict = SeqIO.to_dict(SeqIO.parse(f_handle, "fasta"))

# only want exons for now. unfortunately can't limit by strand so have to do that after the fact
# we can speed things up a bit by limiting to just the chromosomes we have sequences for
limit_info = dict(gff_id=f_dict.keys(), gff_type=["exon"])
# NOTE(review): truncated fragment — begins mid add_argument() call and ends
# inside an unfinished `"/".join(` call. In "swap" mode, mirrors the original
# data set's alignment directory tree under "<dataSet>Swapped"/orig (the same
# subdirectories writeTaskIdFiles.py creates for an original run, per the inline
# comment). origSam/swappedSam are presumably consumed past the chunk end —
# confirm against the full script. Inside a `try:` whose handler is not visible.
default="swap") parser.add_argument('-v', '--verbose', help='print extra debugging info', action='store_true') args = parser.parse_args() try: swappedDataSet = args.dataSet + "Swapped" origSam = "/".join([args.alignmentParDir, args.dataSet, "orig"]) swappedSam = "/".join([args.alignmentParDir, swappedDataSet, "orig"]) if args.mode == "swap": ### create output directories if they don't exist, and the alignment subdirectories ### these are all that are created by writeTaskIdFiles.py for original run, all other directories are output as usual during the run in analysis mode utils_os.createDirectory("/".join( [args.alignmentParDir, swappedDataSet])) utils_os.createDirectory("/".join( [args.alignmentParDir, swappedDataSet, "orig"])) utils_os.createDirectory("/".join( [args.alignmentParDir, swappedDataSet, "orig", "denovo"])) utils_os.createDirectory("/".join([ args.alignmentParDir, swappedDataSet, "orig", "still_unaligned" ])) utils_os.createDirectory("/".join( [args.alignmentParDir, swappedDataSet, "orig", "genome"])) utils_os.createDirectory("/".join( [args.alignmentParDir, swappedDataSet, "orig", "junction"])) utils_os.createDirectory("/".join( [args.alignmentParDir, swappedDataSet, "orig", "ribo"])) utils_os.createDirectory("/".join(
parser.add_argument("-w", "--window", required=True, help = "size = w of window where if read occurs at X, then window starts at X-w and ends at X+w") parser.add_argument("-n","--UserBPdistance", required = True, help = "looking for PE > X base pairs apart. Linda's default window is 100K.") args=parser.parse_args() if args.origDir[-1] != "/": inpath = args.origDir + "/" else: inpath = args.origDir if args.outputDir[-1] != "/": outpath = args.outputDir + "/DistantPEFiles/" else: outpath = args.outputDir + "DistantPEFiles/" utils_os.createDirectory(outpath) UserBPdistance = int(args.UserBPdistance) # change the input path to the path where your file exists os.chdir(inpath) genomefiles = sorted(glob.glob(inpath + "genome/sorted*" + args.stem + "*.sam")) regfiles = sorted(glob.glob(inpath + "reg/sorted*" + args.stem + "*.sam")) print genomefiles print regfiles
# NOTE(review): truncated fragment — opens mid-function: `handle` and
# `junctions` are defined before this view (the write/close tail of a report-
# writing helper), then a __main__ block follows. The main loop walks
# <dirA>/reports, matches report filenames against patt_filename (compiled
# elsewhere), derives the id-file name by replacing "report" with "_output",
# and calls combineResults for each; combinedReports output dir is created
# up front. Python 2 print statements.
# NOTE(review): `junctions = {}` is reset per matching file and presumably
# shared with combineResults via a global — confirm in the full script.
handle.write("swapped_circOrLinear\tswapped_decoyOrAnom\tswapped_unmapped\tswapped_pval\ttotal_reads\n") for j in junctions: handle.write(str(junctions[j]) + "\n") handle.close() if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-a', '--dirA', help='directory containing original pipeline reports (parent of reports, ids, etc)', required=True) parser.add_argument('-b', '--dirB', help='directory containing swapped pipeline reports (parent of reports, ids, etc directories)', required=True) parser.add_argument('-q', '--fastqIdStyle', help='type of read ids used', required=True, choices=['appended', 'complete']) parser.add_argument('-v', '--verbose', help='print extra debugging info', action='store_true') args = parser.parse_args() utils_os.createDirectory("/".join([args.dirA, "combinedReports"])) for f in os.listdir("/".join([args.dirA, "reports"])): reportFileName = os.path.basename(f) if args.verbose: print reportFileName if patt_filename.search(reportFileName): idFileName = reportFileName.replace("report", "_output") # could be denovo or annotated report file, all handled the same if args.verbose: print reportFileName print idFileName junctions = {} combineResults(reportFileName, idFileName)
# NOTE(review): truncated fragment — opens mid-function (`print len(allJunctions)`
# belongs to a function defined before this view) and ends mid loop body (the
# per-file fileId is computed but the call that consumes it is past the chunk
# end). Main block: either process a single exon pickle (-s) or every file in
# exonDir matching patt_exonfile (compiled elsewhere), deriving a fileId
# (chr-style, per the inline comments) via utils_os.getFileId and building
# junction fasta files into fastaDir. Python 2 print statements.
# NOTE(review): window default here is 100000; the similar fragment at the L11
# chunk uses 1000000 — confirm which is intended if these are meant to agree.
print len(allJunctions) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-w', '--window', help='sliding window size to create junctions', default=100000, type=int) parser.add_argument('-e', '--exonDir', help='directory containing exon pickle files', default='output/exons') parser.add_argument('-s', '--singleFile', help='path to single exon file that should be parsed for junctions') parser.add_argument('-r', '--recordDir', help='directory containing seq record pickle files', default='output/records') parser.add_argument('-f', '--fastaDir', help='directory to output junction fasta files, will be created if does not exist', default='output/fasta') parser.add_argument('-v', '--verbose', help='print extra debugging info', action='store_true') args = parser.parse_args() # create fastaDir if it doesn't exist utils_os.createDirectory(args.fastaDir) # only run for a single file if args.singleFile: if args.verbose: print "running for single file", args.singleFile exonObj = os.path.basename(args.singleFile) fileId = utils_os.getFileId(patt_exonfilename, 1, args.singleFile) # usually something like chr# which is in the name of each created file createJunctions(fileId, exonObj) else: if args.verbose: print "running for directory", args.exonDir # or loop through files in exonDir to create junction file for each for exonObj in os.listdir(args.exonDir): if patt_exonfile.search(exonObj): # only parse if this is an exon pickled file fileId = utils_os.getFileId(patt_exonfilename, 1, exonObj) # usually something like chr# which is in the name of each created file
# NOTE(review): truncated fragment — ends at `if args.unalignedMode:` with no
# body visible. Same option-dump / output-tree setup as the L3 fragment
# (reports, glmReports, glmModels, ids under <parentDir>/<outDirName>), then for
# read-1 samples initializes the three id-classification dicts described in the
# inline comments (ignoreIds / regIds / nonRegIds). Python 2 print statements.
# Inside a `try:` whose handler is not visible. If the duplicated setup here and
# at L3 diverge, they should be reconciled.
parser.add_argument('-v', '--verbose', help='print extra debugging info', action='store_true') args = parser.parse_args() if args.verbose: print "parentDir:", args.parentDir print "sampleId:", args.sampleId print "outDirName:", args.outDirName print "style:", args.fastqIdStyle print "a1:", args.aScore1 print "a2:", args.aScore2 print "overhang:", args.overhang print "id dir suffix:", args.junctionIdDirSuffix print "unaligned:", args.unalignedMode # make output dirs if they don't exist utils_os.createDirectory("/".join([args.parentDir, args.outDirName])) utils_os.createDirectory("/".join([args.parentDir, args.outDirName, "reports"])) # these are the reports using the naive method utils_os.createDirectory("/".join([args.parentDir, args.outDirName, "glmReports"])) # GLM will be run later and those reports will be stored here utils_os.createDirectory("/".join([args.parentDir, args.outDirName, "glmModels"])) # GLM will be run later and those models will be stored here utils_os.createDirectory("/".join([args.parentDir, args.outDirName, "ids"])) # txt files of read ids assigned to circular or linear category # just doing read1s if args.sampleId.endswith("1"): try: # populate the ignoreIds, regIds, nonRegIds for this file and also print out regIds and nonRegIds to juncNonGR file ignoreIds = {} # ribo and genome aligned regIds = {} # regular junction overlapped and not ribo or genome aligned (aligned to reg-only index) nonRegIds = {} # junction overlapped and not ribo or genome aligned or regular-junction aligned # we treat denovo reads as the equivalent of the junction reads in unalignedMode if args.unalignedMode:
# NOTE(review): truncated fragment — ends at `if patt_exonfile.search(exonObj):`
# with the loop body cut off. GTF variant of the junction-building main seen in
# the L9 chunk, adding -n1/-n2 (gene-name field with gene_id fallback); window
# default here is 1000000 vs 100000 in the L9 chunk — confirm the discrepancy is
# intentional. Single-file (-s) vs whole-exonDir modes; fileId derived via
# utils_os.getFileId with patt_exonfilename (compiled elsewhere). Python 2
# print statements.
parser.add_argument('-w', '--window', help='sliding window size to create junctions', default=1000000, type=int) parser.add_argument('-e', '--exonDir', help='directory containing exon pickle files', default='output/exons') parser.add_argument('-s', '--singleFile', help='path to single exon file that should be parsed for junctions') parser.add_argument('-r', '--recordDir', help='directory containing seq record pickle files', default='output/records') parser.add_argument('-f', '--fastaDir', help='directory to output junction fasta files, will be created if does not exist', default='output/fasta') parser.add_argument('-n1', '--name1', help='name of field in gtf to use for gene names', default='gene_name') parser.add_argument('-n2', '--name2', help='name of field in gtf to use for gene names if n1 does not exist', default='gene_id') parser.add_argument('-v', '--verbose', help='print extra debugging info', action='store_true') args = parser.parse_args() # create fastaDir if it doesn't exist utils_os.createDirectory(args.fastaDir) # only run for a single file if args.singleFile: if args.verbose: print "running for single file", args.singleFile exonObj = os.path.basename(args.singleFile) fileId = utils_os.getFileId(patt_exonfilename, 1, args.singleFile) # usually something like chr# which is in the name of each created file createJunctions(fileId, exonObj) else: if args.verbose: print "running for directory", args.exonDir # or loop through files in exonDir to create junction file for each for exonObj in os.listdir(args.exonDir): if patt_exonfile.search(exonObj): # only parse if this is an exon pickled file
# NOTE(review): truncated fragment — begins mid add_argument() call ('--dirB')
# and ends after two verbose prints with the rest of the loop body cut off.
# GLM-report variant of the L8 combiner: walks <dirA>/glmReports, matches
# against patt_GLMfilename (compiled elsewhere), and maps either report flavor
# (circJuncProbs / linearJuncProbs) to its "output" id-file name. Python 2
# print statements.
'--dirB', help= 'directory containing swapped pipeline reports (parent of glmReports, ids, etc directories)', required=True) parser.add_argument('-q', '--fastqIdStyle', help='type of read ids used', required=True, choices=['appended', 'complete']) parser.add_argument('-v', '--verbose', help='print extra debugging info', action='store_true') args = parser.parse_args() utils_os.createDirectory("/".join([args.dirA, "combinedReports"])) for f in os.listdir("/".join([args.dirA, "glmReports"])): reportFileName = os.path.basename(f) if args.verbose: print reportFileName if patt_GLMfilename.search(reportFileName): idFileName = reportFileName.replace( "circJuncProbs", "output").replace("linearJuncProbs", "output") # could be linear or circular file if args.verbose: print reportFileName print idFileName
# NOTE(review): truncated fragment — begins mid add_argument() call
# (choices for the -b/--bounds option, per the L15 duplicate of this main).
# Rejects using -d and -s together, creates outDir, compiles the junction-id
# regex (three capture groups: two gene names and the junction type, matching
# the example id in the comment), then runs writeLimitedFasta (defined before
# this view) over every .fa file in fastaDir or over the single file.
# Near-duplicate of the L15 chunk — keep them in sync.
choices=['all', 'within', 'between'], default='all') parser.add_argument('-p', '--postpend', help='text to add to end of output file names', required=True) parser.add_argument('-v', '--verbose', help='print info about data obtained', action='store_true') args = parser.parse_args() if args.fastaDir and args.singleFile: sys.exit("Only 1 of -d and -s can be specified") # create output directory if it does not exist utils_os.createDirectory(args.outDir) # fasta junction ids look like chr10|TTC40:134751179|MYC1:134722640|reg|- # where we are interested in TTC40, MYC1, and reg id_patt = re.compile(".+?\|(.+?):.+?\|(.+?):.+?\|(.+?)\|.*") if args.fastaDir: # loop through files directory for f in os.listdir(args.fastaDir): if f.endswith(".fa"): # only parse if this is a fasta file writeLimitedFasta('/'.join([args.fastaDir, f])) elif args.singleFile: writeLimitedFasta(args.singleFile)
# NOTE(review): truncated fragment — begins mid add_argument() call
# ('--recOutDir'). Duplicate of the annotation-pickling setup in the L5 chunk:
# builds the exon/gene/record output paths under outDir, creates them, loads the
# fasta into a dict keyed by record id, and builds the GFF limit dict (exon
# features on known chromosomes only). Python 2 print statement at the end.
# NOTE(review): the open()/close() pair here leaks the handle if SeqIO raises —
# same issue fixed with a `with` block in the L5 duplicate; reconcile the two.
'--recOutDir', help= 'directory to output seq record pickle files, will be created within outDir if it does not exist', default='records') parser.add_argument('-v', '--verbose', help='print info about data obtained', action='store_true') args = parser.parse_args() exonOutFullPath = '/'.join([args.outDir, args.exonOutDir]) geneOutFullPath = '/'.join([args.outDir, args.geneOutDir]) recOutFullPath = '/'.join([args.outDir, args.recOutDir]) # create output directories if necessary utils_os.createDirectory(args.outDir) utils_os.createDirectory(exonOutFullPath) utils_os.createDirectory(geneOutFullPath) utils_os.createDirectory(recOutFullPath) # read in the sequences f_handle = open(args.fastaFile, "rU") f_dict = SeqIO.to_dict(SeqIO.parse(f_handle, "fasta")) f_handle.close() # only want exons for now. unfortunately can't limit by strand so have to do that after the fact # we can speed things up a bit by limiting to just the chromosomes we have sequences for limit_info = dict(gff_id=f_dict.keys(), gff_type=["exon"]) if args.verbose: print "data we are searching for in gff: " + str(limit_info)
if __name__ == "__main__":
    # Driver for limiting junction fasta files: operates on every .fa file in a
    # directory (-d) or on one file (-s), writing limited copies into -o.
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--fastaDir', help='directory containing fasta files to limit. Either -d or -s required.')
    parser.add_argument('-s', '--singleFile', help='fasta file to limit. Either -d or -s required.')
    parser.add_argument('-o', '--outDir', help='directory to output files. Will be created if it does not exist.', required=True)
    parser.add_argument('-t', '--juncType', help='type of junction to limit to', choices=['reg', 'rev', 'dup'], required=True)
    parser.add_argument('-b', '--bounds', help='type of junction to limit to', choices=['all', 'within', 'between'], default='all')
    parser.add_argument('-p', '--postpend', help='text to add to end of output file names', required=True)
    parser.add_argument('-v', '--verbose', help='print info about data obtained', action='store_true')
    args = parser.parse_args()

    # -d and -s are mutually exclusive input modes
    if args.fastaDir and args.singleFile:
        sys.exit("Only 1 of -d and -s can be specified")

    # create output directory if it does not exist
    utils_os.createDirectory(args.outDir)

    # fasta junction ids look like chr10|TTC40:134751179|MYC1:134722640|reg|-
    # where we are interested in TTC40, MYC1, and reg
    id_patt = re.compile(".+?\|(.+?):.+?\|(.+?):.+?\|(.+?)\|.*")

    if args.fastaDir:
        # directory mode: limit every fasta (*.fa) file found in fastaDir
        for fastaName in os.listdir(args.fastaDir):
            if fastaName.endswith(".fa"):
                writeLimitedFasta('/'.join([args.fastaDir, fastaName]))
    elif args.singleFile:
        # single-file mode
        writeLimitedFasta(args.singleFile)