Beispiel #1
0
if __name__  == "__main__":
    
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', '--readDir', help='directory containing fastq files', required=True)
    parser.add_argument('-a', '--alignmentParDir', help='parent directory where directory for alignments for this data set will be created', default="/srv/gsfs0/projects/salzman/Linda/alignments")
    parser.add_argument('-t', '--taskDir', help='name of directory under alignmentParDir to output task file', default="taskIdFiles")
    parser.add_argument('-d', '--dataSet', required=True,
                        help='name of directory under alignmentParDir that all alignment files will be written to')
    parser.add_argument('-v', '--verbose', help='print extra debugging info', action='store_true')
    parser.add_argument('-u', '--unalignedMode', help='pass this flag if we are aligning the unaligned reads from previous pipeline run', action='store_true')
    args = parser.parse_args()
    
    try:
        ### create output directories if they don't exist, and the alignment subdirectories
        utils_os.createDirectory("/".join([args.alignmentParDir, args.taskDir]))
        utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet]))

        utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet, "orig"]))
        utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet, "orig", "denovo"]))
        utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet, "orig", "still_unaligned"]))
        utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet, "orig", "genome"]))
        utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet, "orig", "junction"]))
        utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet, "orig", "ribo"]))
        utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet, "orig", "reg"]))
        utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet, "orig", "ids"]))
        utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet, "orig", "ids", "denovo"]))
        utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet, "orig", "ids", "genome"]))
        utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet, "orig", "ids", "junction"]))
        utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet, "orig", "ids", "ribo"]))
        utils_os.createDirectory("/".join([args.alignmentParDir, args.dataSet, "orig", "ids", "reg"]))
Beispiel #2
0
                        action='store_true')
    args = parser.parse_args()

    if args.verbose:
        print "parentDir:", args.parentDir
        print "sampleId:", args.sampleId
        print "outDirName:", args.outDirName
        print "style:", args.fastqIdStyle
        print "a1:", args.aScore1
        print "a2:", args.aScore2
        print "overhang:", args.overhang
        print "id dir suffix:", args.junctionIdDirSuffix
        print "unaligned:", args.unalignedMode

    # make output dirs if they don't exist
    utils_os.createDirectory("/".join([args.parentDir, args.outDirName]))
    utils_os.createDirectory("/".join(
        [args.parentDir, args.outDirName,
         "reports"]))  # these are the reports using the naive method
    utils_os.createDirectory("/".join([
        args.parentDir, args.outDirName, "glmReports"
    ]))  # GLM will be run later and those reports will be stored here
    utils_os.createDirectory("/".join([
        args.parentDir, args.outDirName, "glmModels"
    ]))  # GLM will be run later and those models will be stored here
    utils_os.createDirectory("/".join([
        args.parentDir, args.outDirName, "ids"
    ]))  # txt files of read ids assigned to circular or linear category
    # just doing read1s
    if args.sampleId.endswith("1"):
Beispiel #3
0
    required=True,
    help="looking for PE > X base pairs apart. Linda's default window is 100K."
)
args = parser.parse_args()

# Normalize the input directory so it always ends with a trailing slash.
# NOTE(review): raises IndexError if --origDir is an empty string.
if args.origDir[-1] != "/":
    inpath = args.origDir + "/"
else:
    inpath = args.origDir

# Output goes under a DistantPEFiles/ subdirectory of --outputDir,
# inserting a separating slash only when the user left it off.
if args.outputDir[-1] != "/":
    outpath = args.outputDir + "/DistantPEFiles/"
else:
    outpath = args.outputDir + "DistantPEFiles/"

# utils_os is a project-local helper; presumably creates the directory if
# it does not exist -- TODO confirm.
utils_os.createDirectory(outpath)

# Minimum base-pair separation between paired ends we are scanning for.
UserBPdistance = int(args.UserBPdistance)

# change the input path to the path where your file exists
os.chdir(inpath)

# Sorted SAM files from the genome and reg alignment subdirectories,
# filtered by the sample stem.
genomefiles = sorted(glob.glob(inpath + "genome/sorted*" + args.stem +
                               "*.sam"))
regfiles = sorted(glob.glob(inpath + "reg/sorted*" + args.stem + "*.sam"))

# Python 2 print statements: show which files will be processed.
print genomefiles
print regfiles

# print SamFiles
#opening paired files
Beispiel #4
0
 # Parse options for building exon/gene/seq-record pickle files from a
 # genome fasta plus a GFF annotation file.
 parser = argparse.ArgumentParser()
 parser.add_argument('-f', '--fastaFile', required=True, help='path to fasta file with chromosome sequences')
 parser.add_argument('-a', '--annotationFile', required=True, help='path to gff file with exon annotations')
 parser.add_argument('-o', '--outDir', help='directory to output files, will be created if it does not exist', default='output')
 parser.add_argument('-e', '--exonOutDir', help='directory to output exon pickle files, will be created within outDir if it does not exist', default='exons')
 parser.add_argument('-g', '--geneOutDir', help='directory to output gene pickle files, will be created within outDir if it does not exist', default='genes')
 parser.add_argument('-r', '--recOutDir', help='directory to output seq record pickle files, will be created within outDir if it does not exist', default='records')
 parser.add_argument('-v', '--verbose', help='print info about data obtained', action='store_true')
 args = parser.parse_args()

 # Each output category gets its own subdirectory under outDir.
 exonOutFullPath = '/'.join([args.outDir, args.exonOutDir])
 geneOutFullPath = '/'.join([args.outDir, args.geneOutDir])
 recOutFullPath = '/'.join([args.outDir, args.recOutDir])

 # create output directories if necessary
 utils_os.createDirectory(args.outDir)
 utils_os.createDirectory(exonOutFullPath)
 utils_os.createDirectory(geneOutFullPath)
 utils_os.createDirectory(recOutFullPath)

 # read in the sequences
 # NOTE(review): "rU" mode is Python 2 era; handle is closed manually rather
 # than via a with-block.
 f_handle = open(args.fastaFile, "rU")
 f_dict = SeqIO.to_dict(SeqIO.parse(f_handle, "fasta"))
 f_handle.close()

 # only want exons for now. unfortunately can't limit by strand so have to do that after the fact
 # we can speed things up a bit by limiting to just the chromosomes we have sequences for
 limit_info = dict(
     gff_id = f_dict.keys(),
     gff_type = ["exon"])
 
        default="swap")
    parser.add_argument('-v',
                        '--verbose',
                        help='print extra debugging info',
                        action='store_true')
    args = parser.parse_args()

    try:
        swappedDataSet = args.dataSet + "Swapped"
        origSam = "/".join([args.alignmentParDir, args.dataSet, "orig"])
        swappedSam = "/".join([args.alignmentParDir, swappedDataSet, "orig"])

        if args.mode == "swap":
            ### create output directories if they don't exist, and the alignment subdirectories
            ### these are all that are created by writeTaskIdFiles.py for original run, all other directories are output as usual during the run in analysis mode
            utils_os.createDirectory("/".join(
                [args.alignmentParDir, swappedDataSet]))

            utils_os.createDirectory("/".join(
                [args.alignmentParDir, swappedDataSet, "orig"]))
            utils_os.createDirectory("/".join(
                [args.alignmentParDir, swappedDataSet, "orig", "denovo"]))
            utils_os.createDirectory("/".join([
                args.alignmentParDir, swappedDataSet, "orig", "still_unaligned"
            ]))
            utils_os.createDirectory("/".join(
                [args.alignmentParDir, swappedDataSet, "orig", "genome"]))
            utils_os.createDirectory("/".join(
                [args.alignmentParDir, swappedDataSet, "orig", "junction"]))
            utils_os.createDirectory("/".join(
                [args.alignmentParDir, swappedDataSet, "orig", "ribo"]))
            utils_os.createDirectory("/".join(
Beispiel #6
0
parser.add_argument("-w", "--window", required=True, help = "size = w of window where if read occurs at X, then window starts at X-w and ends at X+w")
parser.add_argument("-n","--UserBPdistance", required = True, help = "looking for PE > X base pairs apart. Linda's default window is 100K.")
args=parser.parse_args()


if args.origDir[-1] != "/":
    inpath = args.origDir + "/"
else:
    inpath = args.origDir

if args.outputDir[-1] != "/":
    outpath = args.outputDir + "/DistantPEFiles/"
else:
    outpath = args.outputDir + "DistantPEFiles/"
    
utils_os.createDirectory(outpath)

UserBPdistance = int(args.UserBPdistance)



# change the input path to the path where your file exists
os.chdir(inpath)


genomefiles = sorted(glob.glob(inpath + "genome/sorted*" + args.stem + "*.sam"))
regfiles = sorted(glob.glob(inpath + "reg/sorted*" + args.stem + "*.sam"))

print genomefiles
print regfiles
    handle.write("swapped_circOrLinear\tswapped_decoyOrAnom\tswapped_unmapped\tswapped_pval\ttotal_reads\n")
    for j in junctions:
        handle.write(str(junctions[j]) + "\n")
    handle.close()
        
        
    
# Entry point: merge naive-method report files from the original run (dirA)
# with their matching id files into dirA/combinedReports.
if __name__  == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-a', '--dirA', help='directory containing original pipeline reports (parent of reports, ids, etc)', required=True)
    parser.add_argument('-b', '--dirB', help='directory containing swapped pipeline reports (parent of reports, ids, etc directories)', required=True)
    parser.add_argument('-q', '--fastqIdStyle', help='type of read ids used', required=True, choices=['appended', 'complete'])
    parser.add_argument('-v', '--verbose', help='print extra debugging info', action='store_true')
    args = parser.parse_args()

    # combined output lives alongside the original reports under dirA
    utils_os.createDirectory("/".join([args.dirA, "combinedReports"]))

    for f in os.listdir("/".join([args.dirA, "reports"])):
        # os.listdir already yields bare names, so basename is effectively a no-op
        reportFileName = os.path.basename(f)
        if args.verbose:
            print reportFileName

        # patt_filename is a module-level compiled regex defined elsewhere in the file
        if patt_filename.search(reportFileName):
            idFileName = reportFileName.replace("report", "_output")  # could be denovo or annotated report file, all handled the same
            if args.verbose:
                print reportFileName
                print idFileName

            # fresh dict per report; presumably populated as a global by
            # combineResults -- TODO confirm
            junctions = {}
            combineResults(reportFileName, idFileName)

        # NOTE(review): allJunctions is not defined anywhere in this snippet and
        # this print sits at loop-body level -- it looks like a stray line from
        # a different example; verify before relying on it.
        print len(allJunctions)


# Entry point: build junction fasta files from pickled exon objects, either
# for a single file (-s) or for every exon pickle found in -e/--exonDir.
if __name__  == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument('-w', '--window', help='sliding window size to create junctions', default=100000, type=int)
    parser.add_argument('-e', '--exonDir', help='directory containing exon pickle files', default='output/exons')
    parser.add_argument('-s', '--singleFile', help='path to single exon file that should be parsed for junctions')
    parser.add_argument('-r', '--recordDir', help='directory containing seq record pickle files', default='output/records')
    parser.add_argument('-f', '--fastaDir', help='directory to output junction fasta files, will be created if does not exist', default='output/fasta')
    parser.add_argument('-v', '--verbose', help='print extra debugging info', action='store_true')
    args = parser.parse_args()

    # create fastaDir if it doesn't exist
    utils_os.createDirectory(args.fastaDir)

    # only run for a single file
    if args.singleFile:
        if args.verbose:
            print "running for single file", args.singleFile
        exonObj = os.path.basename(args.singleFile)
        # patt_exonfilename and getFileId are project helpers defined elsewhere
        fileId = utils_os.getFileId(patt_exonfilename, 1, args.singleFile)  # usually something like chr# which is in the name of each created file
        createJunctions(fileId, exonObj)
    else:
        if args.verbose:
            print "running for directory", args.exonDir
        # or loop through files in exonDir to create junction file for each
        for exonObj in os.listdir(args.exonDir):
            if patt_exonfile.search(exonObj): # only parse if this is an exon pickled file
                fileId = utils_os.getFileId(patt_exonfilename, 1, exonObj)  # usually something like chr# which is in the name of each created file
                # NOTE(review): snippet appears truncated here -- the matching
                # createJunctions(fileId, exonObj) call is missing; verify.
Beispiel #9
0
 parser.add_argument('-v', '--verbose', help='print extra debugging info', action='store_true')
 args = parser.parse_args()
 
 if args.verbose:
     print "parentDir:", args.parentDir
     print "sampleId:", args.sampleId
     print "outDirName:", args.outDirName
     print "style:", args.fastqIdStyle
     print "a1:", args.aScore1
     print "a2:", args.aScore2
     print "overhang:", args.overhang
     print "id dir suffix:", args.junctionIdDirSuffix
     print "unaligned:", args.unalignedMode
 
 # make output dirs if they don't exist
 utils_os.createDirectory("/".join([args.parentDir, args.outDirName]))
 utils_os.createDirectory("/".join([args.parentDir, args.outDirName, "reports"]))  # these are the reports using the naive method
 utils_os.createDirectory("/".join([args.parentDir, args.outDirName, "glmReports"])) # GLM will be run later and those reports will be stored here
 utils_os.createDirectory("/".join([args.parentDir, args.outDirName, "glmModels"]))  # GLM will be run later and those models will be stored here
 utils_os.createDirectory("/".join([args.parentDir, args.outDirName, "ids"]))  # txt files of read ids assigned to circular or linear category
 # just doing read1s 
 if args.sampleId.endswith("1"):
 
     try:
         # populate the ignoreIds, regIds, nonRegIds for this file and also print out regIds and nonRegIds to juncNonGR file
         ignoreIds = {} # ribo and genome aligned 
         regIds = {} # regular junction overlapped and not ribo or genome aligned (aligned to reg-only index)
         nonRegIds = {} # junction overlapped and not ribo or genome aligned or regular-junction aligned
         
         # we treat denovo reads as the equivalent of the junction reads in unalignedMode
         if args.unalignedMode:
Beispiel #10
0
    parser.add_argument('-w', '--window', help='sliding window size to create junctions', default=1000000, type=int)
    parser.add_argument('-e', '--exonDir', help='directory containing exon pickle files', default='output/exons')
    parser.add_argument('-s', '--singleFile', help='path to single exon file that should be parsed for junctions')
    parser.add_argument('-r', '--recordDir', help='directory containing seq record pickle files',
                        default='output/records')
    parser.add_argument('-f', '--fastaDir',
                        help='directory to output junction fasta files, will be created if does not exist',
                        default='output/fasta')
    parser.add_argument('-n1', '--name1', help='name of field in gtf to use for gene names', default='gene_name')
    parser.add_argument('-n2', '--name2', help='name of field in gtf to use for gene names if n1 does not exist',
                        default='gene_id')
    parser.add_argument('-v', '--verbose', help='print extra debugging info', action='store_true')
    args = parser.parse_args()

    # create fastaDir if it doesn't exist
    utils_os.createDirectory(args.fastaDir)

    # only run for a single file
    if args.singleFile:
        if args.verbose:
            print "running for single file", args.singleFile
        exonObj = os.path.basename(args.singleFile)
        fileId = utils_os.getFileId(patt_exonfilename, 1,
                                    args.singleFile)  # usually something like chr# which is in the name of each created file
        createJunctions(fileId, exonObj)
    else:
        if args.verbose:
            print "running for directory", args.exonDir
        # or loop through files in exonDir to create junction file for each
        for exonObj in os.listdir(args.exonDir):
            if patt_exonfile.search(exonObj):  # only parse if this is an exon pickled file
Beispiel #11
0
        '--dirB',
        help=
        'directory containing swapped pipeline reports (parent of glmReports, ids, etc directories)',
        required=True)
    parser.add_argument('-q',
                        '--fastqIdStyle',
                        help='type of read ids used',
                        required=True,
                        choices=['appended', 'complete'])
    parser.add_argument('-v',
                        '--verbose',
                        help='print extra debugging info',
                        action='store_true')
    args = parser.parse_args()

    utils_os.createDirectory("/".join([args.dirA, "combinedReports"]))

    for f in os.listdir("/".join([args.dirA, "glmReports"])):
        reportFileName = os.path.basename(f)
        if args.verbose:
            print reportFileName

        if patt_GLMfilename.search(reportFileName):
            idFileName = reportFileName.replace(
                "circJuncProbs",
                "output").replace("linearJuncProbs",
                                  "output")  # could be linear or circular file
            if args.verbose:
                print reportFileName
                print idFileName
Beispiel #12
0
                        choices=['all', 'within', 'between'],
                        default='all')
    # remaining options: output-file-name postpend text and verbosity
    parser.add_argument('-p',
                        '--postpend',
                        help='text to add to end of output file names',
                        required=True)
    parser.add_argument('-v',
                        '--verbose',
                        help='print info about data obtained',
                        action='store_true')

    args = parser.parse_args()

    # -d (fastaDir) and -s (singleFile) are mutually exclusive input sources
    if args.fastaDir and args.singleFile:
        sys.exit("Only 1 of -d and -s can be specified")

    # create output directory if it does not exist
    utils_os.createDirectory(args.outDir)

    # fasta junction ids look like chr10|TTC40:134751179|MYC1:134722640|reg|-
    # where we are interested in TTC40, MYC1, and reg
    id_patt = re.compile(".+?\|(.+?):.+?\|(.+?):.+?\|(.+?)\|.*")

    if args.fastaDir:
        # loop through files directory
        for f in os.listdir(args.fastaDir):
            if f.endswith(".fa"):  # only parse if this is a fasta file
                # writeLimitedFasta is defined elsewhere in this file
                writeLimitedFasta('/'.join([args.fastaDir, f]))
    elif args.singleFile:
        writeLimitedFasta(args.singleFile)
Beispiel #13
0
        '--recOutDir',
        help=
        'directory to output seq record pickle files, will be created within outDir if it does not exist',
        default='records')
    parser.add_argument('-v',
                        '--verbose',
                        help='print info about data obtained',
                        action='store_true')
    args = parser.parse_args()

    # each output category gets its own subdirectory under outDir
    exonOutFullPath = '/'.join([args.outDir, args.exonOutDir])
    geneOutFullPath = '/'.join([args.outDir, args.geneOutDir])
    recOutFullPath = '/'.join([args.outDir, args.recOutDir])

    # create output directories if necessary
    utils_os.createDirectory(args.outDir)
    utils_os.createDirectory(exonOutFullPath)
    utils_os.createDirectory(geneOutFullPath)
    utils_os.createDirectory(recOutFullPath)

    # read in the sequences
    # NOTE(review): "rU" mode is Python 2 era; handle is closed manually rather
    # than via a with-block.
    f_handle = open(args.fastaFile, "rU")
    f_dict = SeqIO.to_dict(SeqIO.parse(f_handle, "fasta"))
    f_handle.close()

    # only want exons for now. unfortunately can't limit by strand so have to do that after the fact
    # we can speed things up a bit by limiting to just the chromosomes we have sequences for
    limit_info = dict(gff_id=f_dict.keys(), gff_type=["exon"])

    if args.verbose:
        print "data we are searching for in gff: " + str(limit_info)
Beispiel #14
0
# Entry point: filter junction fasta files down to a requested junction type
# and bounds, writing the limited copies via writeLimitedFasta.
if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--fastaDir',
                        help='directory containing fasta files to limit. Either -d or -s required.')
    parser.add_argument('-s', '--singleFile',
                        help='fasta file to limit. Either -d or -s required.')
    parser.add_argument('-o', '--outDir', required=True,
                        help='directory to output files. Will be created if it does not exist.')
    parser.add_argument('-t', '--juncType', choices=['reg', 'rev', 'dup'], required=True,
                        help='type of junction to limit to')
    parser.add_argument('-b', '--bounds', choices=['all', 'within', 'between'], default='all',
                        help='type of junction to limit to')
    parser.add_argument('-p', '--postpend', required=True,
                        help='text to add to end of output file names')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='print info about data obtained')

    args = parser.parse_args()

    # exactly one input source may be given
    if args.fastaDir and args.singleFile:
        sys.exit("Only 1 of -d and -s can be specified")

    # make sure the output directory exists before writing anything
    utils_os.createDirectory(args.outDir)

    # fasta junction ids look like chr10|TTC40:134751179|MYC1:134722640|reg|-
    # where we are interested in TTC40, MYC1, and reg
    id_patt = re.compile(".+?\|(.+?):.+?\|(.+?):.+?\|(.+?)\|.*")

    if args.fastaDir:
        # limit every fasta file found in the directory
        for fastaName in os.listdir(args.fastaDir):
            if fastaName.endswith(".fa"):  # only fasta files
                writeLimitedFasta('/'.join([args.fastaDir, fastaName]))
    elif args.singleFile:
        writeLimitedFasta(args.singleFile)