def getGeneNamesFromGTF():
    """Print unique gene names of a chosen biotype extracted from a GTF file.

    Reads the GTF given with -f, keeps features whose source column
    (column 2) matches the -g biotype, optionally filters genes by
    intron content (-i: both / int_cont / int_less) and writes the
    unique names, one per line, to -o or standard output.
    """
    parser = OptionParser(usage="getGenesNames; type usage: %prog [options] -f filename")
    files = OptionGroup(parser, "File input options")
    files.add_option("-f", "--input_file", dest="gtf_file", help="Provide the path to your gtf data file. Default is standard input.", type="str", default=None)
    files.add_option("-g", "--genes", dest="genes", help="Which biotype of features to get: mRNA, tRNA, rRNA, snRNA, snoRNA", type="str", default='tRNA')
    # FIX: the original statement carried a stray trailing comma, silently
    # wrapping the call's result in a one-element tuple.
    files.add_option("-i", "--introns", dest="introns", help="Introns? both - not discriminate; int_cont -only intron containing; int_less - only int less", choices=["both", "int_cont", "int_less"], default="both")
    files.add_option("-o", "--output_file", dest="output_file", help="Use this flag to provide an output file name. Default is standard output.", default=None)
    parser.add_option_group(files)
    (options, args) = parser.parse_args()

    ### By default, input and output are expected from the standard input or standard output.
    signal(SIGPIPE, SIG_DFL)  # exit quietly when the downstream pipe closes
    outfile = sys.stdout
    if options.output_file:
        outfile = open(options.output_file, "w")

    gtf = GTF2.Parse_GTF()
    gtf_path = gtm.getGTF(options.gtf_file)  # resolve the path once instead of twice
    gtf.read_GTF(gtf_path)

    names_list = list()
    ### for loop extracting tRNA names
    for line in open(gtf_path, "r"):
        if line.startswith('#'):
            continue  # skip header/comment lines
        line_elements = line.strip().split('\t')
        if str(line_elements[1]) != options.genes:
            continue
        match = re.search(r"gene_name\s\"(.*?)\"", str(line_elements[8]))
        if match is None:
            # BUG FIX: the original used a bare 'except: pass' here, which
            # reused the name captured on a previous iteration (or raised
            # NameError on the first failure). Skip the feature instead.
            continue
        name = match.group(1)
        if options.introns == "both":
            if name not in names_list:
                names_list.append(name)
        elif options.introns == "int_cont":
            # keep only intron-containing genes
            if gtf.intronCoordinates(name) and name not in names_list:
                names_list.append(name)
        elif options.introns == "int_less":
            # keep only intron-less genes
            if not gtf.intronCoordinates(name) and name not in names_list:
                names_list.append(name)

    outfile.write('\n'.join(names_list) + '\n')
    if outfile is not sys.stdout:
        outfile.close()  # BUG FIX: the original also closed sys.stdout
def getFastaSeqs(): parser = OptionParser(usage="List of genes as std input and parameters") parser.add_option("-g", "--gtf_file", dest="gtf_file", help="Provide the path to your gtf file.", type="str", default=None) parser.add_option("-f", "--fasta_file", dest="fasta_file", help="Provide the path to your fasta file.", type="str", default=None) parser.add_option("-t", "--tab_file", dest="tab_file", help="Provide the path to your genom tab file.", type="str", default=None) parser.add_option("-r", "--ranges", dest="ranges", help="Provide ranges(flanks) for genes.", type="int", default=0) parser.add_option("-a", "--5end", dest="five_end", help="Set up 5` flank. If minus then print only 3` end. Python slicing [a:b] i.e. [200:401] - from 200 to 400; [-200:] - last 200; " "[:-200] from begining till -200 before end", type="int", default=None) parser.add_option("-b", "--3end", dest="three_end", help="Set up 5` flank. If minus then print only 5` end. Python slicing [a:b]", type="int", default=None) (options, args) = parser.parse_args() signal(SIGPIPE,SIG_DFL) # to manage with stdin and stdout #crating gtf object gtf = GTF2.Parse_GTF() gtf.read_GTF(gtm.getGTF(options.gtf_file)) gtf.read_FASTA(gtm.getFASTA(options.fasta_file)) gtf.read_TAB(gtm.getTAB(options.tab_file)) for i in sys.stdin: gene_name = str(i.strip()) genomic_seq = gtf.genomicSequence(gene_name, ranges=options.ranges) print '>'+gene_name print genomic_seq[options.five_end:options.three_end]+'\n'
def getNameFromId4Tab(): parser = OptionParser(usage="usage: List of genes as std input") parser.add_option("-g", "--gtf_file", dest="gtf_file", help="Provide the path to your gtf file.", type="str", default=None) (options, args) = parser.parse_args() signal(SIGPIPE, SIG_DFL) gtf = GTF2.Parse_GTF() gtf.read_GTF(gtm.getGTF(options.gtf_file)) id_to_gene = dict() for gene_name in gtf.genes: gene_id = gtf.genes[gene_name]['gene_id'] id_to_gene[gene_id] = gene_name for i in sys.stdin: i_elem = i.strip().split("\t") gene_id = i_elem[0] seq = i_elem[1] gene_name = id_to_gene[gene_id] print gene_name + '\t' + seq
def mRNA(): usage = "Usage: To create input concat file run novo2concat.py" parser = argparse.ArgumentParser(usage=usage) files = parser.add_argument_group('Options for input files') files.add_argument("-g", "--gtf_file", dest="gtf_file", help="Provide the path to your gtf file.", type=str, default=None) files.add_argument("-i", "--input_file", dest="input_file", help="Provide the path to your concat file. REQUIRED.", metavar="FILE", default=None, required=True) files.add_argument("--5flank", dest="five_prime_flank", type=int, help="Set up 5 prime flank in pileup file. Default = 0", default=0) files.add_argument("--3flank", dest="three_prime_flank", type=int, help="Set up 3 prime flank in pileup file. Default = 0", default=0) universal = parser.add_argument_group('Universal options') universal.add_argument("-t", "--hits_threshold", dest="hits_threshold", type=int, help="Set up threshold for pileup. Default 0 reads", default=0) universal.add_argument("-n", "--normalized", dest="normalized", action="store_true", help="Use when you want to work on data normalized 'reads per Milion'. Default: False", default=False) output = parser.add_argument_group('Options for output files') output.add_argument("-p", "--prefix", dest="out_prefix", type=str, help="Prefix for output files. Default to standard output. Not supported for -o ratio.", default=None) output.add_argument("-o", dest="output_files", choices=['bind'], help="Select from following options:" "(1) Print binding windows in fasta file", default="bind") output.add_argument("--peaks", dest="print_peaks", action="store_true", help="print peaks on plots. Default: False", default=False) output.add_argument("--valleys", dest="print_valleys", action="store_true", help="print valleys on plots. Default: False", default=False) special = parser.add_argument_group('Special options for some -o choices') special.add_argument("--lookahead", dest="lookahead", type=int, help="Set up lookahead parameter for pypeaks function. 
Default = 20", default=20) special.add_argument("-w", "--window", dest="window", type=int, help="Set up size of window for bind calculation (-o bind). Default: 10", default=10) special.add_argument("-e", dest="experiment_to_use", type=str, help="For -o bind, which experiment to use.") # special.add_argument("--ntotal", dest="ntotal", action="store_true", help="Normalize data to sum of all reads (sum = 1). Default: False", default=False) # special.add_argument("--nmax", dest="nmax", action="store_true", help="Normalize data to maximal value (max = 1). Default: False", default=False) # special.add_argument("-a", dest="to_divide", type=str, help="experiment to divide by -b (-o fig_ratio)", # default=None) # special.add_argument("-b", dest="divisor", type=str, help="experiment being divisor for -a (-o fig_ratio)", # default=None) options = parser.parse_args() #checking input input_file = options.input_file #preparing naming of output files if options.out_prefix: prefix = options.out_prefix+'_' else: prefix = str() if options.normalized == True: prefix = 'normalized_'+prefix data = mRNAFromConcat(gtf_file=gtm.getGTF(options.gtf_file), five_prime_flank=options.five_prime_flank, three_prime_flank=options.three_prime_flank, hits_threshold=options.hits_threshold, lookahead=options.lookahead, prefix=prefix, npM=options.normalized) if options.output_files == "bind": #reading csv file data.read_csv(input_file, use='deletions') #calculating readthrough, details, normalize # data.calculate(details=options.details, ntotal=options.ntotal, nmax=options.nmax) data.bind(exp_to_use=options.experiment_to_use, window=options.window) print '# Done.'
def getFastaSeqs():
    """For every gene name read from standard input, print a FASTA header and
    the gene's genomic sequence (with optional flanks and Python-style slicing).

    NOTE(review): this is a second definition of getFastaSeqs in the module;
    it shadows the earlier one and additionally calls gtf.codingSequence()
    at the end.
    """
    parser = OptionParser(usage="List of genes as std input and parameters")
    parser.add_option("-g", "--gtf_file", dest="gtf_file", help="Provide the path to your gtf file.", type="str", default=None)
    parser.add_option("-f", "--fasta_file", dest="fasta_file", help="Provide the path to your fasta file.", type="str", default=None)
    parser.add_option("-t", "--tab_file", dest="tab_file", help="Provide the path to your genom tab file.", type="str", default=None)
    parser.add_option("-r", "--ranges", dest="ranges", help="Provide ranges(flanks) for genes.", type="int", default=0)
    parser.add_option(
        "-a",
        "--5end",
        dest="five_end",
        help=
        "Set up 5` flank. If minus then print only 3` end. Python slicing [a:b] i.e. [200:401] - from 200 to 400; [-200:] - last 200; "
        "[:-200] from begining till -200 before end",
        type="int",
        default=None)
    parser.add_option(
        "-b",
        "--3end",
        dest="three_end",
        help=
        "Set up 5` flank. If minus then print only 5` end. Python slicing [a:b]",
        type="int",
        default=None)
    (options, args) = parser.parse_args()
    signal(SIGPIPE, SIG_DFL)  # to manage with stdin and stdout
    #crating gtf object and attaching sequence / chromosome-length data
    gtf = GTF2.Parse_GTF()
    gtf.read_GTF(gtm.getGTF(options.gtf_file))
    gtf.read_FASTA(gtm.getFASTA(options.fasta_file))
    gtf.read_TAB(gtm.getTAB(options.tab_file))
    # One FASTA record per input gene name.
    for i in sys.stdin:
        gene_name = str(i.strip())
        genomic_seq = gtf.genomicSequence(gene_name, ranges=options.ranges)
        print '>' + gene_name
        print genomic_seq[options.five_end:options.three_end] + '\n'
    # NOTE(review): called with no arguments and the return value discarded --
    # looks like a leftover; confirm whether this call is still needed.
    gtf.codingSequence()
def rRNA():
    """CLI entry point: plot and compare rRNA (RDN37-1) pileup coverage read
    from a concat file.

    Output modes (-o): std / single figures, log2 ratio of two experiments
    (ratio, ratio_smooth) and cross-experiment correlations.
    """
    usage = "Usage: create pileups with pyPileup (pyCRAC package) then in directory containing pileup files type run i.e.:" + "\n" + \
            "cat file.concat | gwiderRNA.py or gwiderRNA.py -i file.concat"
    parser = OptionParser(usage=usage)
    parser.add_option("-g", "--gtf_file", dest="gtf_file", help="Provide the path to your gtf file.", metavar="FILE", default=None)
    parser.add_option("-i", dest="input_file", help="Provide the path to your concat file.", metavar="FILE", default=None)
    parser.add_option("--5flank", dest="five_prime_flank", type="int", help="Set up 5 prime flank in pileup file. Default = 1000", default=1000)
    parser.add_option("--3flank", dest="three_prime_flank", type="int", help="Set up 3 prime flank in pileup file. Default = 1000", default=1000)
    parser.add_option("-l", "--lookahead", dest="lookahead", type="int", help="Set up lookahead parameter for pypeaks function. Default = 20", default=20)
    parser.add_option("-t", "--hits_threshold", dest="hits_threshold", type="int", help="Set up threshold for pileup. Default 100 reads", default=100)
    # parser.add_option("-r", "--readthrough", dest="readthrough", type="int", help="Set up when readthrough should start countin. Default: 0",
    #                   default=0)
    parser.add_option("-p", "--prefix", dest="out_prefix", type="str", help="Prefix for output files. Default to standard output", default=None)
    parser.add_option("--peaks", dest="print_peaks", action="store_true", help="Add into command line if you want to print peaks on plots. Default: False", default=False)
    parser.add_option("-o", "--output", dest="output_files", choices=["std", "ratio", "single", "correlations", "ratio_smooth"],
                      help="Select from following options: (1) std - RDN37-1; experiment after experimen ;" + '\n'
                      "(2)ratio - ratio for -a divided by -b; (3)single - plot RDN37-1 plots 1 per page; (4) correlations - calculate correlations for different experiments; (5)ratio_smooth - ratio for -a divided by -b",
                      default="std")
    parser.add_option("-a", dest="to_divide", type="str", help="experiment to divide by -b", default=None)
    parser.add_option("-b", dest="divisor", type="str", help="experiment being divisor for -a", default=None)
    parser.add_option("-n", "--normalized", dest="normalized", action="store_true", help="Use when you want to work on data normalized reads per Milion? Default: False", default=False)
    (options, args) = parser.parse_args()

    gtf_file = gtm.getGTF(options.gtf_file)
    # output file prefix
    if options.out_prefix:
        prefix = options.out_prefix + '_'
    else:
        prefix = str()
    # ratio plots require normalized data, so force the flag
    if options.output_files == "ratio":
        options.normalized = True

    data = rRNAFromConcat(gtf_file=gtf_file, five_prime_flank=options.five_prime_flank, print_peaks=options.print_peaks, three_prime_flank=options.three_prime_flank, hits_threshold=options.hits_threshold, lookahead=options.lookahead, prefix=prefix, normalized=options.normalized)
    data.read_csv(options.input_file)
    data.slice_data()
    if options.print_peaks == True:
        data.find_peaks()

    # dispatch on the requested output mode
    if options.output_files == "std":
        data.print_rRNA()  # RDN37 should be prepared with 1000 nt flanks
    if options.output_files == "single":
        data.single_rRNA()  # RDN37 should be prepared with 1000 nt flanks
    if options.output_files == "ratio":
        # data.fig_ratio(options.to_divide, options.divisor) # plots ratio to_divide/divisor
        data.fig_log2ratio(options.to_divide, options.divisor)  # plots log2 ratio to_divide/divisor
    if options.output_files == "ratio_smooth":
        data.fig_smoothlog2ratio(options.to_divide, options.divisor)  # plots log2 ratio to_divide/divisor using smoothed data
    if options.output_files == "correlations":
        data.correlations()
    print '# Done.'
def getGeneLength(): parser = OptionParser(usage="usage: List of genes as std input") parser.add_option("-g", "--gtf_file", dest="gtf_file", help="Provide the path to your gtf file.", type="str", default=None) (options, args) = parser.parse_args() signal(SIGPIPE,SIG_DFL) gtf = GTF2.Parse_GTF() gtf.read_GTF(gtm.getGTF(options.gtf_file)) for i in sys.stdin: gene_name = str(i.strip()) gene_length = gtf.geneLength(gene_name) print gene_name+"\t"+str(gene_length)
def getGeneLength(): parser = OptionParser(usage="usage: List of genes as std input") parser.add_option("-g", "--gtf_file", dest="gtf_file", help="Provide the path to your gtf file.", type="str", default=None) (options, args) = parser.parse_args() signal(SIGPIPE, SIG_DFL) gtf = GTF2.Parse_GTF() gtf.read_GTF(gtm.getGTF(options.gtf_file)) for i in sys.stdin: gene_name = str(i.strip()) gene_length = gtf.geneLength(gene_name) print gene_name + "\t" + str(gene_length)
def getNameFromId(): parser = OptionParser(usage="usage: List of genes as std input") parser.add_option("-g", "--gtf_file", dest="gtf_file", help="Provide the path to your gtf file.", type="str", default=None) (options, args) = parser.parse_args() signal(SIGPIPE,SIG_DFL) gtf = GTF2.Parse_GTF() gtf.read_GTF(gtm.getGTF(options.gtf_file)) id_to_gene = dict() for gene_name in gtf.genes: gene_id = gtf.genes[gene_name]['gene_id'] id_to_gene[gene_id] = gene_name for i in sys.stdin: gene_id = str(i.strip()) gene_name = id_to_gene[gene_id] print gene_name
def hittable():
    """CLI entry point: downstream analysis of pyReadCounter hittables.

    --output selects the analysis: 'correlation' (pairwise experiment
    correlations), 'count' (merge/count hittables) or 'piechart'
    (class-composition plots).
    """
    ## option parser
    usage = "For more options type -h"
    description = "Downstream analysis on hittables crated by pyReadCounter. Chose type of analysys Usage: create hittables using pyReadCounter then run script in the folder containing hittables"
    parser = argparse.ArgumentParser(usage=usage, description=description)
    #functions
    parser.add_argument('--output', required=True, dest="function", choices=['correlation', 'count', 'piechart'],
                        help='REQUIRED, Calculate "correlations"; '
                        '"count" hittables for further analysis. Ideal to work with multiple experiments; '
                        'Plot "piechart"s for hittable classes')
    # parser for input files options
    files = parser.add_argument_group('Input file options')
    files.add_argument("-g", dest="gtf_file", help="Provide the path to your gtf file.", type=str, default=None)
    files.add_argument("--stdin", dest="stdin", action="store_true", help="Use standard input instead ./*hittable* Default: False", default=False)
    # universal options
    universal = parser.add_argument_group('universal options')
    universal.add_argument("-n", dest="normalized", action="store_true", help="Use when you want to work on data normalized 'reads per Milion'. Default: False", default=False)
    universal.add_argument("-w", dest="whole_name", action="store_true", help="As defauls scripts takes 'a_b_c' from a_b_c_hittable_reads.txt as experiment name. Use this option if your file names do not suit to this pattern. Default: False", default=False)
    universal.add_argument("-p", dest="out_prefix", type=str, help="Prefix for output files.", default=None)
    # parser specific for counts
    corr_group = parser.add_argument_group("counts options")
    corr_group.add_argument("--rpkm", dest="rpkm", action="store_true", help="Use RPKM instead of hits. Default: False", default=False)
    # parser specific for correlations
    # NOTE(review): 'corr_group' is re-bound here, overwriting the counts
    # group reference above; harmless but confusing naming.
    corr_group = parser.add_argument_group("correlation options")
    corr_group.add_argument("-c", dest="gene_class", action="store_true", help="Calculate Pearson coefficient for different classes separately. Default: False", default=False)
    corr_group.add_argument("-o", dest="output", choices=["p", "s", "k", "a"], help="Select from following options: p - Pearson (standard correlation coefficient); s - Spearman rank correlation; k - Kendall Tau correlation coefficient; a - all at once", default="p")
    #parser specific for piecharts
    piechart_group = parser.add_argument_group("piechart options")
    piechart_group.add_argument("-s", "--single", dest="print_single", help="Print hittables in single files", action="store_true", default=False)
    options = parser.parse_args()

    ## Creating HittableClass object
    data = ghc.HittableClass(gtf=gtm.getGTF(options.gtf_file), whole_name=options.whole_name, n_rpM=options.normalized, out_prefix=options.out_prefix, read_stdin=options.stdin)

    #running chosen function
    if options.function == 'correlation':
        data.correlation(output=options.output, gene_class=options.gene_class)
    elif options.function == 'count':
        data.count(normalize=options.normalized, use_RPKM=options.rpkm)
    elif options.function == 'piechart':
        data.plot(print_single=options.print_single)
    print "Done."
def rRNA():
    """CLI entry point: plot and compare rRNA (RDN37-1) pileup coverage read
    from a concat file.

    NOTE(review): this is a second definition of rRNA in the module; it
    shadows the earlier one (same behaviour, reformatted source).
    """
    usage = "Usage: create pileups with pyPileup (pyCRAC package) then in directory containing pileup files type run i.e.:" + "\n" + \
            "cat file.concat | gwiderRNA.py or gwiderRNA.py -i file.concat"
    parser = OptionParser(usage=usage)
    parser.add_option("-g", "--gtf_file", dest="gtf_file", help="Provide the path to your gtf file.", metavar="FILE", default=None)
    parser.add_option("-i", dest="input_file", help="Provide the path to your concat file.", metavar="FILE", default=None)
    parser.add_option("--5flank", dest="five_prime_flank", type="int", help="Set up 5 prime flank in pileup file. Default = 1000", default=1000)
    parser.add_option("--3flank", dest="three_prime_flank", type="int", help="Set up 3 prime flank in pileup file. Default = 1000", default=1000)
    parser.add_option("-l", "--lookahead", dest="lookahead", type="int", help="Set up lookahead parameter for pypeaks function. Default = 20", default=20)
    parser.add_option("-t", "--hits_threshold", dest="hits_threshold", type="int", help="Set up threshold for pileup. Default 100 reads", default=100)
    # parser.add_option("-r", "--readthrough", dest="readthrough", type="int", help="Set up when readthrough should start countin. Default: 0",
    #                   default=0)
    parser.add_option("-p", "--prefix", dest="out_prefix", type="str", help="Prefix for output files. Default to standard output", default=None)
    parser.add_option("--peaks", dest="print_peaks", action="store_true", help="Add into command line if you want to print peaks on plots. Default: False", default=False)
    parser.add_option("-o", "--output", dest="output_files", choices=["std", "ratio", "single", "correlations", "ratio_smooth"],
                      help="Select from following options: (1) std - RDN37-1; experiment after experimen ;" + '\n'
                      "(2)ratio - ratio for -a divided by -b; (3)single - plot RDN37-1 plots 1 per page; (4) correlations - calculate correlations for different experiments; (5)ratio_smooth - ratio for -a divided by -b",
                      default="std")
    parser.add_option("-a", dest="to_divide", type="str", help="experiment to divide by -b", default=None)
    parser.add_option("-b", dest="divisor", type="str", help="experiment being divisor for -a", default=None)
    parser.add_option("-n", "--normalized", dest="normalized", action="store_true", help="Use when you want to work on data normalized reads per Milion? Default: False", default=False)
    (options, args) = parser.parse_args()

    gtf_file = gtm.getGTF(options.gtf_file)
    # output file prefix
    if options.out_prefix:
        prefix = options.out_prefix + '_'
    else:
        prefix = str()
    # ratio plots require normalized data, so force the flag
    if options.output_files == "ratio":
        options.normalized = True

    data = rRNAFromConcat(gtf_file=gtf_file, five_prime_flank=options.five_prime_flank, print_peaks=options.print_peaks, three_prime_flank=options.three_prime_flank, hits_threshold=options.hits_threshold, lookahead=options.lookahead, prefix=prefix, normalized=options.normalized)
    data.read_csv(options.input_file)
    data.slice_data()
    if options.print_peaks == True:
        data.find_peaks()

    # dispatch on the requested output mode
    if options.output_files == "std":
        data.print_rRNA()  # RDN37 should be prepared with 1000 nt flanks
    if options.output_files == "single":
        data.single_rRNA()  # RDN37 should be prepared with 1000 nt flanks
    if options.output_files == "ratio":
        # data.fig_ratio(options.to_divide, options.divisor) # plots ratio to_divide/divisor
        data.fig_log2ratio(options.to_divide, options.divisor)  # plots log2 ratio to_divide/divisor
    if options.output_files == "ratio_smooth":
        data.fig_smoothlog2ratio(options.to_divide, options.divisor)  # plots log2 ratio to_divide/divisor using smoothed data
    if options.output_files == "correlations":
        data.correlations()
    print '# Done.'
def hittable():
    """CLI entry point: downstream analysis of pyReadCounter hittables.

    NOTE(review): second definition of hittable in the module; it shadows
    the earlier one and adds the 'classes' output mode (classes_to_tab).
    """
    ## option parser
    usage = "For more options type -h"
    description = "Downstream analysis on hittables crated by pyReadCounter. Chose type of analysys Usage: create hittables using pyReadCounter then run script in the folder containing hittables"
    parser = argparse.ArgumentParser(usage=usage, description=description)
    #functions
    parser.add_argument('--output', required=True, dest="function", choices=['correlation', 'count', 'piechart', 'classes'],
                        help='REQUIRED, Calculate "correlations"; '
                        '"count" hittables for further analysis. Ideal to work with multiple experiments; '
                        'Plot "piechart"s for hittable classes')
    # parser for input files options
    files = parser.add_argument_group('Input file options')
    files.add_argument("-g", dest="gtf_file", help="Provide the path to your gtf file.", type=str, default=None)
    files.add_argument("--stdin", dest="stdin", action="store_true", help="Use standard input instead ./*hittable* Default: False", default=False)
    # universal options
    universal = parser.add_argument_group('universal options')
    universal.add_argument("-n", dest="normalized", action="store_true", help="Use when you want to work on data normalized 'reads per Milion'. Default: False", default=False)
    universal.add_argument("-w", dest="whole_name", action="store_true", help="As defauls scripts takes 'a_b_c' from a_b_c_hittable_reads.txt as experiment name. Use this option if your file names do not suit to this pattern. Default: False", default=False)
    universal.add_argument("-p", dest="out_prefix", type=str, help="Prefix for output files.", default=None)
    # parser specific for counts
    corr_group = parser.add_argument_group("counts options")
    corr_group.add_argument("--rpkm", dest="rpkm", action="store_true", help="Use RPKM instead of hits. Default: False", default=False)
    # parser specific for correlations
    # NOTE(review): 'corr_group' is re-bound here, overwriting the counts
    # group reference above; harmless but confusing naming.
    corr_group = parser.add_argument_group("correlation options")
    corr_group.add_argument("-c", dest="gene_class", action="store_true", help="Calculate Pearson coefficient for different classes separately. Default: False", default=False)
    corr_group.add_argument("-o", dest="output", choices=["p", "s", "k", "a"], help="Select from following options: p - Pearson (standard correlation coefficient); s - Spearman rank correlation; k - Kendall Tau correlation coefficient; a - all at once", default="p")
    #parser specific for piecharts
    piechart_group = parser.add_argument_group("piechart options")
    piechart_group.add_argument("-s", "--single", dest="print_single", help="Print hittables in single files", action="store_true", default=False)
    options = parser.parse_args()

    ## Creating HittableClass object
    data = ghc.HittableClass(gtf=gtm.getGTF(options.gtf_file), whole_name=options.whole_name, n_rpM=options.normalized, out_prefix=options.out_prefix, read_stdin=options.stdin)

    #running chosen function
    if options.function == 'correlation':
        data.correlation(output=options.output, gene_class=options.gene_class)
    elif options.function == 'count':
        data.count(normalize=options.normalized, use_RPKM=options.rpkm)
    elif options.function == 'piechart':
        data.plot(print_single=options.print_single)
    elif options.function == 'classes':
        data.classes_to_tab()
    print "Done."
def tRNA():
    """CLI entry point: analyse tRNA gene coverage from a concat file.

    Parses the command line, builds a tRNAFromConcatv2 object and dispatches
    on the -o output mode (figures, nucleotide-resolution plots, termination
    efficiency, tab-delimited statistics, ...).
    """
    usage = "Usage: UNDER CONSTRUCTION - not all functions are available. To create concat file run novo2concat"
    parser = argparse.ArgumentParser(usage=usage)
    files = parser.add_argument_group('Options for input files')
    files.add_argument("-g", "--gtf_file", dest="gtf_file", help="Provide the path to your gtf file.", type=str, default=None)
    files.add_argument("-i", "--input_file", dest="input_file", help="Provide the path to your concat file. REQUIRED.", metavar="FILE", default=None, required=True)
    files.add_argument("--5flank", dest="five_prime_flank", type=int, help="Set up 5 prime flank in pileup file. Default = 250", default=250)
    files.add_argument("--3flank", dest="three_prime_flank", type=int, help="Set up 3 prime flank in pileup file. Default = 250", default=250)
    universal = parser.add_argument_group('Universal options')
    universal.add_argument("-t", "--hits_threshold", dest="hits_threshold", type=int, help="Set up threshold for pileup. Default 0 reads", default=0)
    universal.add_argument("-r", "--readthrough", dest="readthrough", type=int, help="Set up when readthrough starts. Default: 15", default=15)
    universal.add_argument("-n", "--normalized", dest="normalized", action="store_true", help="Use when you want to work on data normalized 'reads per Milion'. Default: False", default=False)
    output = parser.add_argument_group('Options for output files')
    output.add_argument("-p", "--prefix", dest="out_prefix", type=str, help="Prefix for output files. Default to standard output. Not supported for -o ratio.", default=None)
    output.add_argument("-d", "--details", dest="details", action="store_true", help="Print details in text file. WARNING: works only with one experiment.", default=False)
    output.add_argument("-o", dest="output_files",
                        choices=["fig", "fig_std", "fig_tight", "fig_ratio", "fig_boxes", "nuc3", "nuc5", "nuc_gene", "nuc_energy", "termination_valleys", "termination", "termination_text", "stat_text", "both"],
                        help="Select from following options:"
                        "(1) fig - plot tRNA genes coverage; (2) figstd - plot gene after gene; (3) fig_tight; (4) fig_ratio - log2 for -a divided by -b, uses normalized data"
                        "(5) fig_boxes - mark A and B boxes (provide path to boxes position file)"
                        "(6) nuc3 - for nucleotide 3' end resolution; (7) nuc_gene - for nucleotide resolution of gene only"
                        "(8) nuc_energy - plots 3' end under nucleotide resolution with energy plots"
                        "(9) termination_valleys - for each valley calculate termination efficiency"
                        "(10) termination - calculate termination efficiency for last 20 nt"
                        "(11) termination_text - calculate termination efficiency for first 20 nt of 3' end and print text file"
                        "(12) stat_text - tab-deliminated; (13) both - fig and stat_text; (14) - nuc5 - for nucleotide 5' end resolution",
                        default="both")
    output.add_argument("--peaks", dest="print_peaks", action="store_true", help="print peaks on plots. Default: False", default=False)
    output.add_argument("--valleys", dest="print_valleys", action="store_true", help="print valleys on plots. Default: False", default=False)
    output.add_argument("--mark", dest="mark", choices=["A", "T", "C", "G"], help="mark nucleotide on plots. Default: None", default=None)
    special = parser.add_argument_group('Special options for some -o choices')
    special.add_argument("--lookahead", dest="lookahead", type=int, help="Set up lookahead parameter for pypeaks function. Default = 20", default=20)
    special.add_argument("-w", "--window", dest="window", type=int, help="Set up size of window for energy calculation (-o nuc_energy). Default: 5", default=5)
    special.add_argument("--ntotal", dest="ntotal", action="store_true", help="Normalize data to sum of all reads (sum = 1). Default: False", default=False)
    special.add_argument("--nmax", dest="nmax", action="store_true", help="Normalize data to maximal value (max = 1). Default: False", default=False)
    special.add_argument("-a", dest="to_divide", type=str, help="experiment to divide by -b (-o fig_ratio)", default=None)
    special.add_argument("-b", dest="divisor", type=str, help="experiment being divisor for -a (-o fig_ratio)", default=None)
    special.add_argument("--abox", dest="abox_file", help="Provide the path to your tab file with A box start.", metavar="FILE", default=None)
    special.add_argument("--bbox", dest="bbox_file", help="Provide the path to your tab file with B box start.", metavar="FILE", default=None)
    options = parser.parse_args()

    #checking input
    input_file = options.input_file
    if options.output_files == 'fig_boxes' and (options.abox_file == None or options.bbox_file == None):
        exit('Please provide path to both box.tab files using options --abox and --bbox.')
    if options.output_files == 'fig_ratio' and (options.to_divide == None or options.divisor == None):
        exit('Please provide experiments names using options -a and -b.')

    #preparing naming of output files
    if options.out_prefix:
        prefix = options.out_prefix + '_'
        filename = options.out_prefix + '_rt' + str(options.readthrough) + '_l' + str(options.lookahead) + '_t' + str(options.hits_threshold) + '.list'
    else:
        prefix = str()
        filename = 'rt' + str(options.readthrough) + '_l' + str(options.lookahead) + '_t' + str(options.hits_threshold) + '.list'
    if options.print_peaks == True:
        prefix = prefix + 'peaks_'
    if options.print_valleys == True:
        prefix = prefix + 'valleys_'
    if options.normalized == True:
        prefix = 'normalized_' + prefix

    #setting up dependencies between options
    if options.output_files == "fig_ratio":
        options.normalized = True  # ratio plots only make sense on normalized data
    if options.output_files == 'termination_valleys':
        options.print_peaks = True
        options.print_valleys = True

    data = tRNAFromConcatv2(gtf_file=gtm.getGTF(options.gtf_file), five_prime_flank=options.five_prime_flank, print_valleys=options.print_valleys, print_peaks=options.print_peaks, readthrough_start=options.readthrough, three_prime_flank=options.three_prime_flank, hits_threshold=options.hits_threshold, lookahead=options.lookahead, prefix=prefix, normalized=options.normalized)

    #reading csv file
    if options.output_files != "fig_ratio":
        data.read_csv(input_file, null_substitution=False)
    # elif options.output_files == "fig_ratio":
    #     data.read_csv(input_file, null_substitution=True) ## makes all 0 as 1 in hittable

    #finding peaks
    if (options.print_peaks == True or options.print_valleys == True) and options.output_files != "fig_ratio":
        data.find_peaks()

    #calculating readthrough, details, normalize
    if options.output_files != "fig_ratio":
        data.calculate(details=options.details, ntotal=options.ntotal, nmax=options.nmax)
    elif options.output_files == "fig_ratio":
        data.calculate(details=options.details, ntotal=options.ntotal, nmax=options.nmax, pscounts=True)

    #making text files
    if options.output_files == "stat_text" or options.output_files == "both":
        text_file = open(filename, "w")
        data.make_text_file(text_file, details=options.details, ntotal=options.ntotal, nmax=options.nmax)
    if options.output_files == "fig" or options.output_files == "both":
        data.slice_dataframe()
        data.fig_gene_pp()
    if options.output_files == "fig_tight":
        data.slice_dataframe()
        data.fig_gene_pp_tight()
    #marks all T on the plots
    # NOTE(review): "markT" is not among the -o choices above, so this branch
    # is unreachable dead code -- confirm and remove or re-add the choice.
    if options.output_files == "markT":
        data.slice_dataframe()
        data.mark_T()
    #marks all T and CG on the plots
    # NOTE(review): "markTCG" is likewise not an allowed -o choice (unreachable).
    if options.output_files == "markTCG":
        data.slice_dataframe()
        data.mark_T(anti_plot=True)
    if options.output_files == "fig_ratio":
        data.slice_dataframe()
        data.fig_ratio(options.to_divide, options.divisor)
    if options.output_files == "nuc3":
        data.fig_3end_nucleotide_resolution()
    if options.output_files == "nuc5":
        data.fig_5end_nucleotide_resolution()
    if options.output_files == "nuc_gene":
        print 'Needs update. Talk to Tomasz.'
        # data.fig_nucleotide_gene()
    # if options.output_files == "nuc_energy":
    #     data.fig_energy(options.window)
    if options.output_files == "fig_std":
        data.slice_dataframe()
        data.fig_gene_after_gene()
    if options.output_files == "fig_boxes":
        print 'Needs update. Talk to Tomasz.'
        data.slice_dataframe()
        # data.fig_boxes(open(options.abox_file), open(options.bbox_file)) #
    if options.output_files == "termination_valleys":
        print 'Needs update. Talk to Tomasz.'
        data.slice_dataframe()
        # data.termination_efficency_valleys() #
    if options.output_files == "termination":
        print 'Needs update. Talk to Tomasz.'
        data.slice_dataframe()
        # data.termination_efficency()
    if options.output_files == "termination_text":
        data.calculate_dG()
        text_file = open(filename, "w")
        data.make_text_file(text_file, print_dG=True)
    print '# Done.'
def tRNA(): usage = "Usage: UNDER CONSTRUCTION - not all functions are available. To create concat file run novo2concat" parser = argparse.ArgumentParser(usage=usage) files = parser.add_argument_group('Options for input files') files.add_argument("-g", "--gtf_file", dest="gtf_file", help="Provide the path to your gtf file.", type=str, default=None) files.add_argument("-i", "--input_file", dest="input_file", help="Provide the path to your concat file. REQUIRED.", metavar="FILE", default=None, required=True) files.add_argument("--5flank", dest="five_prime_flank", type=int, help="Set up 5 prime flank in pileup file. Default = 250", default=250) files.add_argument("--3flank", dest="three_prime_flank", type=int, help="Set up 3 prime flank in pileup file. Default = 250", default=250) universal = parser.add_argument_group('Universal options') universal.add_argument("-t", "--hits_threshold", dest="hits_threshold", type=int, help="Set up threshold for pileup. Default 0 reads", default=0) universal.add_argument("-r", "--readthrough", dest="readthrough", type=int, help="Set up when readthrough starts. Default: 15", default=15) universal.add_argument("-n", "--normalized", dest="normalized", action="store_true", help="Use when you want to work on data normalized 'reads per Milion'. Default: False", default=False) output = parser.add_argument_group('Options for output files') output.add_argument("-p", "--prefix", dest="out_prefix", type=str, help="Prefix for output files. Default to standard output. Not supported for -o ratio.", default=None) output.add_argument("-d", "--details", dest="details", action="store_true", help="Print details in text file. 
WARNING: works only with one experiment.", default=False) output.add_argument("-o", dest="output_files", choices=["fig", "fig_std", "fig_tight", "fig_ratio", "fig_boxes", "nuc3", "nuc5", "nuc_gene", "nuc_energy", "termination_valleys", "termination", "termination_text", "stat_text", "both"], help="Select from following options:" "(1) fig - plot tRNA genes coverage; (2) figstd - plot gene after gene; (3) fig_tight; (4) fig_ratio - log2 for -a divided by -b, uses normalized data" "(5) fig_boxes - mark A and B boxes (provide path to boxes position file)" "(6) nuc3 - for nucleotide 3' end resolution; (7) nuc_gene - for nucleotide resolution of gene only" "(8) nuc_energy - plots 3' end under nucleotide resolution with energy plots" "(9) termination_valleys - for each valley calculate termination efficiency" "(10) termination - calculate termination efficiency for last 20 nt" "(11) termination_text - calculate termination efficiency for first 20 nt of 3' end and print text file" "(12) stat_text - tab-deliminated; (13) both - fig and stat_text; (14) - nuc5 - for nucleotide 5' end resolution", default="both") output.add_argument("--peaks", dest="print_peaks", action="store_true", help="print peaks on plots. Default: False", default=False) output.add_argument("--valleys", dest="print_valleys", action="store_true", help="print valleys on plots. Default: False", default=False) output.add_argument("--mark", dest="mark", choices=["A","T","C","G"], help="mark nucleotide on plots. Default: None", default=None) special = parser.add_argument_group('Special options for some -o choices') special.add_argument("--lookahead", dest="lookahead", type=int, help="Set up lookahead parameter for pypeaks function. Default = 20", default=20) special.add_argument("-w", "--window", dest="window", type=int, help="Set up size of window for energy calculation (-o nuc_energy). 
Default: 5", default=5) special.add_argument("--ntotal", dest="ntotal", action="store_true", help="Normalize data to sum of all reads (sum = 1). Default: False", default=False) special.add_argument("--nmax", dest="nmax", action="store_true", help="Normalize data to maximal value (max = 1). Default: False", default=False) special.add_argument("-a", dest="to_divide", type=str, help="experiment to divide by -b (-o fig_ratio)", default=None) special.add_argument("-b", dest="divisor", type=str, help="experiment being divisor for -a (-o fig_ratio)", default=None) special.add_argument("--abox", dest="abox_file", help="Provide the path to your tab file with A box start.", metavar="FILE", default=None) special.add_argument("--bbox", dest="bbox_file", help="Provide the path to your tab file with B box start.", metavar="FILE", default=None) options = parser.parse_args() #checking input input_file = options.input_file if options.output_files == 'fig_boxes' and ( options.abox_file == None or options.bbox_file == None ): exit('Please provide path to both box.tab files using options --abox and --bbox.') if options.output_files == 'fig_ratio' and ( options.to_divide == None or options.divisor == None ): exit('Please provide experiments names using options -a and -b.') #preparing naming of output files if options.out_prefix: prefix = options.out_prefix+'_' filename = options.out_prefix+'_rt'+str(options.readthrough)+'_l'+str(options.lookahead)+'_t'+str(options.hits_threshold)+'.list' else: prefix = str() filename = 'rt'+str(options.readthrough)+'_l'+str(options.lookahead)+'_t'+str(options.hits_threshold)+'.list' if options.print_peaks == True: prefix = prefix+'peaks_' if options.print_valleys == True: prefix = prefix+'valleys_' if options.normalized == True: prefix = 'normalized_'+prefix #setting up dependencies if options.output_files == "fig_ratio": options.normalized = True if options.output_files == 'termination_valleys': options.print_peaks = True options.print_valleys = True 
data = tRNAFromConcatv2(gtf_file=gtm.getGTF(options.gtf_file), five_prime_flank=options.five_prime_flank, print_valleys=options.print_valleys, print_peaks=options.print_peaks, readthrough_start=options.readthrough, three_prime_flank=options.three_prime_flank, hits_threshold=options.hits_threshold, lookahead=options.lookahead, prefix=prefix, normalized=options.normalized) #reading csv file if options.output_files != "fig_ratio": data.read_csv(input_file, null_substitution=False) # elif options.output_files == "fig_ratio": # data.read_csv(input_file, null_substitution=True) ## makes all 0 as 1 in hittable #finding peaks if (options.print_peaks == True or options.print_valleys == True ) and options.output_files != "fig_ratio": data.find_peaks() #calculating readthrough, details, normalize if options.output_files != "fig_ratio": data.calculate(details=options.details, ntotal=options.ntotal, nmax=options.nmax) elif options.output_files == "fig_ratio": data.calculate(details=options.details, ntotal=options.ntotal, nmax=options.nmax, pscounts=True) #making text files if options.output_files == "stat_text" or options.output_files == "both": text_file = open(filename, "w") data.make_text_file(text_file, details=options.details, ntotal=options.ntotal, nmax=options.nmax) if options.output_files == "fig" or options.output_files == "both": data.slice_dataframe() data.fig_gene_pp() if options.output_files == "fig_tight": data.slice_dataframe() data.fig_gene_pp_tight() #marks all T on the plots if options.output_files == "markT": data.slice_dataframe() data.mark_T() #marks all T and CG on the plots if options.output_files == "markTCG": data.slice_dataframe() data.mark_T(anti_plot=True) if options.output_files == "fig_ratio": data.slice_dataframe() data.fig_ratio(options.to_divide, options.divisor) if options.output_files == "nuc3": data.fig_3end_nucleotide_resolution() if options.output_files == "nuc5": data.fig_5end_nucleotide_resolution() if options.output_files == "nuc_gene": 
print 'Needs update. Talk to Tomasz.' # data.fig_nucleotide_gene() # if options.output_files == "nuc_energy": # data.fig_energy(options.window) if options.output_files == "fig_std": data.slice_dataframe() data.fig_gene_after_gene() if options.output_files == "fig_boxes": print 'Needs update. Talk to Tomasz.' data.slice_dataframe() # data.fig_boxes(open(options.abox_file), open(options.bbox_file)) # if options.output_files == "termination_valleys": print 'Needs update. Talk to Tomasz.' data.slice_dataframe() # data.termination_efficency_valleys() # if options.output_files == "termination": print 'Needs update. Talk to Tomasz.' data.slice_dataframe() # data.termination_efficency() if options.output_files == "termination_text": data.calculate_dG() text_file = open(filename, "w") data.make_text_file(text_file, print_dG=True) print '# Done.'
def plot():
    """CLI entry point working with a concat file generated by pileupsToConcat.py.

    Reads the concat file and, according to the -o option, produces
    genome-wide plots, tables, GTF files or transcript-length reports via
    a GenomeWidePlot object.  Can plot introns and peaks found by the
    pypeaks script.
    """
    # NOTE(review): an identical `plot` definition appears later in this
    # module and will shadow this one at import time -- confirm which copy
    # is the intended one.
    #setup option parser
    usage = "Usage: gwide function -i input -o output [options]"
    parser = argparse.ArgumentParser(usage=usage)
    files = parser.add_argument_group('Options for input files')
    files.add_argument("-g", "--gtf_file", dest="gtf_file", help="Provide the path to your gtf file.", type=str, default=None)
    files.add_argument("-i", "--input_file", dest="input_file", help="Provide the path to your input file. Required.", metavar="FILE", default=None, required=True)
    # files.add_argument("--input_type", dest="input_type", choices=['concat'], help="Type of input file. Default: concat",
    #                    type=str, default='concat')
    files.add_argument("--5flank", dest="five_prime_flank", type=int, help="Set up 5 prime flank. Default = 250", default=250)
    files.add_argument("--3flank", dest="three_prime_flank", type=int, help="Set up 3 prime flank. Default = 250", default=250)
    files.add_argument("-l", "--list_file", dest="list_file", help="Provide the path to your (tab) file genes.list. Only listed genes will be plotted. Can be aligned as second column", type=str)
    peaks = parser.add_argument_group('Option for peaks finder (pypeaks')
    peaks.add_argument("--lookahead", dest="lookahead", type=int, help="Set up lookahead parameter for pypeaks function. Default = 20", default=20)
    universal = parser.add_argument_group('Universal options')
    # NOTE(review): help text says "Default 100 reads" but the actual default is 0.
    universal.add_argument("-t", "--hits_threshold", dest="hits_threshold", type=int, help="Set up threshold for pileup. Default 100 reads. Genes with highest peak below are not included", default=0)
    universal.add_argument("-r", "--readthrough", dest="readthrough", type=int, help="Set up nt when readthrough should start countin. Default: 15", default=15)
    universal.add_argument("-n", "--normalized", dest="normalized", action="store_true", help="to work on data normalized 'reads per Milion'. Default: False", default=False)
    output = parser.add_argument_group('Options for output files')
    output.add_argument("-p", "--prefix", dest="out_prefix", type=str, help="Prefix for output files. Default to standard output. Not supported for -o ratio.", default=None)
    output.add_argument("-o", "--output", dest="output", choices=["std", "ratio", "aligner", "RTendalign", "table", "Tdensity", "makeGTF", "transcript_length", "makeRTGTF"],
                        help="Select from following options:"+'\n'
                             "(1) std - 5` and 3` end aligned only; (2) ratio - plot gwide ratio a exp / b exp"+'\n'
                             "(3) aligner - std plus chosen aligner from file (-l option)"+'\n'
                             "(4) RTendalign - std and aligned to 3` end of read-through (-l option). -e works to choose experiment to align and filter"+'\n'
                             "(5) table - make *.csv file to plot heatmaps; (6) Tdensity - calculate p-value for non-canonical termination"+'\n'
                             "(7) makeGTF - make GTF file with transcripts length ; (8) transcript_length - save *.txt file with trancripts length for all experiment; "
                             "(9) makeRTGTF - make GTF with tRNA extensions only", default="std")
    special = parser.add_argument_group('Special options for some -o choices')
    special.add_argument("--ntotal", dest="ntotal", action="store_true", help="Normalize to sum of all reads (sum = 1). Default: False", default=False)
    special.add_argument("--nmax", dest="nmax", action="store_true", help="Normalize to maximal value (max = 1). Default: False", default=False)
    special.add_argument("--publish", dest="publish", action="store_true", help="Print plots as separate figures in publication quality. Works with -o ratio and std", default=False)
    special.add_argument("--LRR", dest="left_right_ratio", action="store_true", help="Print ratio between left and right part of the metaprofiles (before and after aligning line)", default=False)
    special.add_argument("-f", dest="filter", type=str, help="Filter in results factor_above_value; type i.e. RT_above_0.25 or a_below_1.5. To chose: RT, a, b, i, e, f, intron", default=None)
    special.add_argument("-e", dest="experiment", type=str, help="Filter according to values from one experiment only", default=None)
    special.add_argument("-a", dest="to_divide", type=str, help="experiment to divide by -b (-o ratio)", default=None)
    special.add_argument("-b", dest="divisor", type=str, help="experiment being divisor for -a (-o ratio)", default=None)
    special.add_argument("--select", dest="select", type=str, help="To print additional plot with selecter area and no titles keep form 200_300 (range from 200 to 300)", default=None)
    special.add_argument("--peak_min", dest="peak_min", type=int, help="minimum of peak average for -o Tdensity. Default = 300", default=300)
    special.add_argument("--peak_size", dest="peak_size", type=int, help="peak size for -o Tdensity. Default = 20", default=20)
    options = parser.parse_args()
    gtf_file = gtm.getGTF(options.gtf_file)
    list_file = options.list_file
    #preparing naming of output files
    # NOTE(review): `filename` is built here but never used in this function.
    if options.out_prefix:
        prefix = options.out_prefix+'_'
        filename = options.out_prefix+'_rt'+str(options.readthrough)+'_l'+str(options.lookahead)+'_t'+str(options.hits_threshold)+'.list'
    else:
        prefix = str()
        filename = 'rt'+str(options.readthrough)+'_l'+str(options.lookahead)+'_t'+str(options.hits_threshold)+'.list'
    if options.normalized == True:
        prefix = 'nRpM_'+prefix
    data = GenomeWidePlot(gtf_file=gtf_file, five_prime_flank=options.five_prime_flank, readthrough_start=options.readthrough, three_prime_flank=options.three_prime_flank, hits_threshold=options.hits_threshold, lookahead=options.lookahead, prefix=prefix, normalized=options.normalized, publish=options.publish, left_right_ratio=options.left_right_ratio)
    #setting up dependencies
    # NOTE(review): this override happens after `data` was constructed with
    # the original `options.normalized` value -- confirm intended ordering.
    if options.output == "ratio":
        options.normalized = True
    #reading csv file
    data.read_csv(options.input_file, skip_nucleotide=True)
    #plotting: dispatch on the -o mode
    if options.output == 'std':
        data.calculate(details=False, ntotal=True, nmax=True)
        data.std(filter=options.filter, experiment_to_filter=options.experiment)
        # extra plots on the per-experiment normalized columns, if requested
        if options.ntotal == True:
            data.std(filter=options.filter, experiment_to_filter=options.experiment, exp_to_use='_ntotal')
        if options.nmax == True:
            data.std(filter=options.filter, experiment_to_filter=options.experiment, exp_to_use='_nmax')
    if options.output == 'aligner':
        if not list_file:
            print "Please provide path how to align files using -l file.list"
        else:
            data.calculate(details=True, ntotal=False, nmax=False)
            data.read_list(list_file)
            data.aligner(file=os.path.basename(list_file), filter=options.filter, experiment_to_filter=options.experiment)
    if options.output == 'RTendalign':
        data.calculate(details=True, ntotal=False, nmax=False)
        data.RT_aligner(filter=options.filter, experiment_to_align=options.experiment)
    if options.output == "ratio":
        # pscounts=True adds pseudo-counts so the division is defined everywhere
        data.calculate(details=False, ntotal=True, nmax=True, pscounts=True)
        if options.ntotal == True:
            data.ratio(to_divide=options.to_divide, divisor=options.divisor, exp_to_use='_ntotal', filter=options.filter)
            if options.select:
                data.ratio(to_divide=options.to_divide, divisor=options.divisor, exp_to_use='_ntotal', select=options.select, filter=options.filter)
        if options.nmax == True:
            data.ratio(to_divide=options.to_divide, divisor=options.divisor, exp_to_use='_nmax', filter=options.filter)
        # unconditional ratio on the raw (non-suffixed) columns
        data.ratio(to_divide=options.to_divide, divisor=options.divisor, filter=options.filter)
    if options.output == "makeRTGTF":
        data.find_peaks()
        data.makeRTGTF()
    if options.output == "table":
        data.table(filter=options.filter, experiment_to_filter=options.experiment)
    if options.output == "Tdensity":
        data.find_peaks()
        # data.calculate()
        data.Tdensity(peak_min=options.peak_min, size=options.peak_size)
    if options.output == "makeGTF":
        data.find_peaks()
        data.maketranscriptGTF()
    if options.output == "transcript_length":
        data.find_peaks()
        data.printTrancriptLength()
    print '# Done.'
def plot():
    """CLI entry point working with a concat file generated by pileupsToConcat.py.

    Reads the concat file and, according to the -o option, produces
    genome-wide plots, tables, GTF files or transcript-length reports via
    a GenomeWidePlot object.  Can plot introns and peaks found by the
    pypeaks script.
    """
    # NOTE(review): this redefines `plot` and shadows an identical earlier
    # definition in this module -- one of the two copies should be removed.
    #setup option parser
    usage = "Usage: gwide function -i input -o output [options]"
    parser = argparse.ArgumentParser(usage=usage)
    files = parser.add_argument_group('Options for input files')
    files.add_argument("-g", "--gtf_file", dest="gtf_file", help="Provide the path to your gtf file.", type=str, default=None)
    files.add_argument("-i", "--input_file", dest="input_file", help="Provide the path to your input file. Required.", metavar="FILE", default=None, required=True)
    # files.add_argument("--input_type", dest="input_type", choices=['concat'], help="Type of input file. Default: concat",
    #                    type=str, default='concat')
    files.add_argument("--5flank", dest="five_prime_flank", type=int, help="Set up 5 prime flank. Default = 250", default=250)
    files.add_argument("--3flank", dest="three_prime_flank", type=int, help="Set up 3 prime flank. Default = 250", default=250)
    files.add_argument("-l", "--list_file", dest="list_file", help="Provide the path to your (tab) file genes.list. Only listed genes will be plotted. Can be aligned as second column", type=str)
    peaks = parser.add_argument_group('Option for peaks finder (pypeaks')
    peaks.add_argument("--lookahead", dest="lookahead", type=int, help="Set up lookahead parameter for pypeaks function. Default = 20", default=20)
    universal = parser.add_argument_group('Universal options')
    # NOTE(review): help text says "Default 100 reads" but the actual default is 0.
    universal.add_argument("-t", "--hits_threshold", dest="hits_threshold", type=int, help="Set up threshold for pileup. Default 100 reads. Genes with highest peak below are not included", default=0)
    universal.add_argument("-r", "--readthrough", dest="readthrough", type=int, help="Set up nt when readthrough should start countin. Default: 15", default=15)
    universal.add_argument("-n", "--normalized", dest="normalized", action="store_true", help="to work on data normalized 'reads per Milion'. Default: False", default=False)
    output = parser.add_argument_group('Options for output files')
    output.add_argument("-p", "--prefix", dest="out_prefix", type=str, help="Prefix for output files. Default to standard output. Not supported for -o ratio.", default=None)
    output.add_argument("-o", "--output", dest="output", choices=["std", "ratio", "aligner", "RTendalign", "table", "Tdensity", "makeGTF", "transcript_length", "makeRTGTF"],
                        help="Select from following options:"+'\n'
                             "(1) std - 5` and 3` end aligned only; (2) ratio - plot gwide ratio a exp / b exp"+'\n'
                             "(3) aligner - std plus chosen aligner from file (-l option)"+'\n'
                             "(4) RTendalign - std and aligned to 3` end of read-through (-l option). -e works to choose experiment to align and filter"+'\n'
                             "(5) table - make *.csv file to plot heatmaps; (6) Tdensity - calculate p-value for non-canonical termination"+'\n'
                             "(7) makeGTF - make GTF file with transcripts length ; (8) transcript_length - save *.txt file with trancripts length for all experiment; "
                             "(9) makeRTGTF - make GTF with tRNA extensions only", default="std")
    special = parser.add_argument_group('Special options for some -o choices')
    special.add_argument("--ntotal", dest="ntotal", action="store_true", help="Normalize to sum of all reads (sum = 1). Default: False", default=False)
    special.add_argument("--nmax", dest="nmax", action="store_true", help="Normalize to maximal value (max = 1). Default: False", default=False)
    special.add_argument("--publish", dest="publish", action="store_true", help="Print plots as separate figures in publication quality. Works with -o ratio and std", default=False)
    special.add_argument("--LRR", dest="left_right_ratio", action="store_true", help="Print ratio between left and right part of the metaprofiles (before and after aligning line)", default=False)
    special.add_argument("-f", dest="filter", type=str, help="Filter in results factor_above_value; type i.e. RT_above_0.25 or a_below_1.5. To chose: RT, a, b, i, e, f, intron", default=None)
    special.add_argument("-e", dest="experiment", type=str, help="Filter according to values from one experiment only", default=None)
    special.add_argument("-a", dest="to_divide", type=str, help="experiment to divide by -b (-o ratio)", default=None)
    special.add_argument("-b", dest="divisor", type=str, help="experiment being divisor for -a (-o ratio)", default=None)
    special.add_argument("--select", dest="select", type=str, help="To print additional plot with selecter area and no titles keep form 200_300 (range from 200 to 300)", default=None)
    special.add_argument("--peak_min", dest="peak_min", type=int, help="minimum of peak average for -o Tdensity. Default = 300", default=300)
    special.add_argument("--peak_size", dest="peak_size", type=int, help="peak size for -o Tdensity. Default = 20", default=20)
    options = parser.parse_args()
    gtf_file = gtm.getGTF(options.gtf_file)
    list_file = options.list_file
    #preparing naming of output files
    # NOTE(review): `filename` is built here but never used in this function.
    if options.out_prefix:
        prefix = options.out_prefix+'_'
        filename = options.out_prefix+'_rt'+str(options.readthrough)+'_l'+str(options.lookahead)+'_t'+str(options.hits_threshold)+'.list'
    else:
        prefix = str()
        filename = 'rt'+str(options.readthrough)+'_l'+str(options.lookahead)+'_t'+str(options.hits_threshold)+'.list'
    if options.normalized == True:
        prefix = 'nRpM_'+prefix
    data = GenomeWidePlot(gtf_file=gtf_file, five_prime_flank=options.five_prime_flank, readthrough_start=options.readthrough, three_prime_flank=options.three_prime_flank, hits_threshold=options.hits_threshold, lookahead=options.lookahead, prefix=prefix, normalized=options.normalized, publish=options.publish, left_right_ratio=options.left_right_ratio)
    #setting up dependencies
    # NOTE(review): this override happens after `data` was constructed with
    # the original `options.normalized` value -- confirm intended ordering.
    if options.output == "ratio":
        options.normalized = True
    #reading csv file
    data.read_csv(options.input_file, skip_nucleotide=True)
    #plotting: dispatch on the -o mode
    if options.output == 'std':
        data.calculate(details=False, ntotal=True, nmax=True)
        data.std(filter=options.filter, experiment_to_filter=options.experiment)
        # extra plots on the per-experiment normalized columns, if requested
        if options.ntotal == True:
            data.std(filter=options.filter, experiment_to_filter=options.experiment, exp_to_use='_ntotal')
        if options.nmax == True:
            data.std(filter=options.filter, experiment_to_filter=options.experiment, exp_to_use='_nmax')
    if options.output == 'aligner':
        if not list_file:
            print "Please provide path how to align files using -l file.list"
        else:
            data.calculate(details=True, ntotal=False, nmax=False)
            data.read_list(list_file)
            data.aligner(file=os.path.basename(list_file), filter=options.filter, experiment_to_filter=options.experiment)
    if options.output == 'RTendalign':
        data.calculate(details=True, ntotal=False, nmax=False)
        data.RT_aligner(filter=options.filter, experiment_to_align=options.experiment)
    if options.output == "ratio":
        # pscounts=True adds pseudo-counts so the division is defined everywhere
        data.calculate(details=False, ntotal=True, nmax=True, pscounts=True)
        if options.ntotal == True:
            data.ratio(to_divide=options.to_divide, divisor=options.divisor, exp_to_use='_ntotal', filter=options.filter)
            if options.select:
                data.ratio(to_divide=options.to_divide, divisor=options.divisor, exp_to_use='_ntotal', select=options.select, filter=options.filter)
        if options.nmax == True:
            data.ratio(to_divide=options.to_divide, divisor=options.divisor, exp_to_use='_nmax', filter=options.filter)
        # unconditional ratio on the raw (non-suffixed) columns
        data.ratio(to_divide=options.to_divide, divisor=options.divisor, filter=options.filter)
    if options.output == "makeRTGTF":
        data.find_peaks()
        data.makeRTGTF()
    if options.output == "table":
        data.table(filter=options.filter, experiment_to_filter=options.experiment)
    if options.output == "Tdensity":
        data.find_peaks()
        # data.calculate()
        data.Tdensity(peak_min=options.peak_min, size=options.peak_size)
    if options.output == "makeGTF":
        data.find_peaks()
        data.maketranscriptGTF()
    if options.output == "transcript_length":
        data.find_peaks()
        data.printTrancriptLength()
    print '# Done.'
metavar="FILE", default=None) files.add_argument("-w", dest="window", help="size of sliding window. Default = 10", type=int, default=10) files.add_argument("-a", dest="aa_type", help="Type of filter applied i.e. b_a_70 = basic aminoacides above or equal 70% within window " "or ar_b_20 = aromatic aminoacids below 20% within window. Option for position 1: positive, negative, charged, polar, hydrophobic, aromatic" "Options for position 2: a - above or equal. Position 3 is percent within sliding window i.e 20 = 2/10 or 3/15" "Not used when -c used", type=str, default=None) files.add_argument("-c", dest="config_list", help="Config.list. Default=False", metavar="FILE", default=None) files.add_argument("--id", dest="id_given", help="gene ID given instead of gene names", action="store_true", default=False) args = parser.parse_args() #reading GTF file to GTF parser and creating id_to_gene list gtf = GTF2.Parse_GTF() gtf.read_GTF(gtm.getGTF(args.gtf_file)) id_to_gene = dict() for gene_name in gtf.genes: gene_id = gtf.genes[gene_name]['gene_id'] id_to_gene[gene_id] = gene_name #reading fasta file in_seq_handle = open(args.fasta_file) seq_dict = SeqIO.to_dict(SeqIO.parse(in_seq_handle, "fasta")) #dictionary with in_seq_handle.close() seq_dict_keys = seq_dict.keys() #function to check -a or config.list syntax def filter_parser(input_str): #checks
#seting up option parser parser = argparse.ArgumentParser(description='Usage: ruffus scirpt designed to make concat file from *.novo files. Make new folder, cp or ln into all novofiles and run novo2concat. IMPORTANT: name of novo file should be name of experiment') parser.add_argument("-g", "--gtf_file", dest="gtf_file", help="Provide the path to your gtf file.", type=str, default=None) parser.add_argument("-t", "--tab_file", dest="tab_file", help="Provide the path to your tab genome file.", type=str, default=None) parser.add_argument("-r", dest="ranges", help="Set up ranges for pyPileup. Default = 250", default=250) parser.add_argument("--3end", dest="three_end", help="Use pyPileup option --3end to only report counts for the 3' end of the reads. Default = False", action="store_true", default=False) parser.add_argument("-l", dest="list_file", help="Provide the FULL path to your gene_names.list file.", type=str, default=None, required=True) parser.add_argument("--tree", dest="tree", help="If you want to leave tree of catalogs including pilups within. Default = None.", action="store_true", default=False) parser.add_argument("-p", dest="prefix", help="Prefix for concat file name", type=str, default="") args = parser.parse_args() gtf, tab, ranges = gtm.getGTF(args.gtf_file), gtm.getTAB(args.tab_file), str(args.ranges) print "Using GTF file: " + gtf print "Using TAB genome file: " + tab #listing novo files files = [f for f in os.listdir('.') if os.path.isfile(f) and f.endswith('.novo')] #gives list of files in current directory directories = [re.sub(r'.novo$', '', d) for d in files] links = [] root_dir = os.getcwd() #making directories for f, d in zip(files, directories): os.mkdir(d) os.chdir(d) subprocess.call('ln -s ../' + f + ' ' + f, shell=True) links.append(os.path.abspath('./'+f))
parser.add_argument("-r", dest="ranges", help="Set up ranges for pyPileup. Default = 250", default=250) parser.add_argument("--3end", dest="three_end", help="Use pyPileup option --3end to only report counts for the 3' end of the reads. Default = False", action="store_true", default=False) parser.add_argument("--5end", dest="five_end", help="Use pyPileup option --5end to only report counts for the 5' end of the reads. Default = False", action="store_true", default=False) parser.add_argument("-l", dest="list_file", help="Provide the FULL path to your gene_names.list file.", type=str, default=None, required=True) parser.add_argument("--tree", dest="tree", help="If you want to leave tree of catalogs including pilups within. Default = None.", action="store_true", default=False) parser.add_argument("--anti", dest="anti", help="Create additional concat file with antisense reads Default = None.", action="store_true", default=False) parser.add_argument("-p", dest="prefix", help="Prefix for concat file name", type=str, default="") args = parser.parse_args() gtf, tab, ranges = gtm.getGTF(args.gtf_file), gtm.getTAB(args.tab_file), str(args.ranges) print "Using GTF file: " + gtf print "Using TAB genome file: " + tab #listing novo files files = [f for f in os.listdir('.') if os.path.isfile(f) and f.endswith('.novo')] #gives list of files in current directory directories = [re.sub(r'.novo$', '', d) for d in files] links = [] root_dir = os.getcwd() #making directories for f, d in zip(files, directories): os.mkdir(d) os.chdir(d) subprocess.call('ln -s ../' + f + ' ' + f, shell=True) links.append(os.path.abspath('./'+f))
parser = argparse.ArgumentParser(usage=usage, formatter_class=RawTextHelpFormatter) files = parser.add_argument_group('Options for input files') files.add_argument("-f", dest="fasta_file", help="Provide the path to your fasta file", metavar="FILE", default=None) files.add_argument("-g", "--gtf_file", dest="gtf_file", help="Provide the path to your gtf file.", metavar="FILE", default=None) files.add_argument("-c", dest="codone", help="codone that want to count", type=str, default='CGA') files.add_argument("--all", dest="save_matrix", help="Saves number of all codones as a matrix. Default=False", action="store_true", default=False) files.add_argument("--id", dest="id_given", help="gene ID given instead of gene names", action="store_true", default=False) args = parser.parse_args() gtf = GTF2.Parse_GTF() gtf.read_GTF(gtm.getGTF(args.gtf_file)) id_to_gene = dict() for gene_name in gtf.genes: gene_id = gtf.genes[gene_name]['gene_id'] id_to_gene[gene_id] = gene_name gene_name = id_to_gene[gene_id] in_seq_handle = open(args.fasta_file) seq_dict = SeqIO.to_dict(SeqIO.parse(in_seq_handle, "fasta")) in_seq_handle.close() seq_dict_keys = seq_dict.keys() matrix = pd.DataFrame()
def mRNA():
    """Command-line entry point: compute binding windows from a concat file.

    Parses CLI options, builds an ``mRNAFromConcat`` analysis object and,
    for the (currently only) ``-o bind`` mode, reads the concat csv
    (``use='deletions'``) and computes binding windows over a sliding
    window of ``-w`` nucleotides for the experiment named by ``-e``.

    Side effects: reads ``sys.argv``; reads the ``-i`` concat file and the
    GTF file resolved by ``gtm.getGTF``; output naming is controlled by
    the ``prefix`` passed to ``mRNAFromConcat``. Returns ``None``.
    """
    usage = "Usage: To create input concat file run novo2concat.py"
    parser = argparse.ArgumentParser(usage=usage)

    files = parser.add_argument_group('Options for input files')
    files.add_argument("-g", "--gtf_file", dest="gtf_file",
                       help="Provide the path to your gtf file.",
                       type=str, default=None)
    files.add_argument("-i", "--input_file", dest="input_file",
                       help="Provide the path to your concat file. REQUIRED.",
                       metavar="FILE", default=None, required=True)
    files.add_argument("--5flank", dest="five_prime_flank", type=int,
                       help="Set up 5 prime flank in pileup file. Default = 0",
                       default=0)
    files.add_argument("--3flank", dest="three_prime_flank", type=int,
                       help="Set up 3 prime flank in pileup file. Default = 0",
                       default=0)

    universal = parser.add_argument_group('Universal options')
    universal.add_argument("-t", "--hits_threshold", dest="hits_threshold", type=int,
                           help="Set up threshold for pileup. Default 0 reads",
                           default=0)
    universal.add_argument("-n", "--normalized", dest="normalized", action="store_true",
                           help="Use when you want to work on data normalized 'reads per Milion'. Default: False",
                           default=False)

    output = parser.add_argument_group('Options for output files')
    output.add_argument("-p", "--prefix", dest="out_prefix", type=str,
                        help="Prefix for output files. Default to standard output. Not supported for -o ratio.",
                        default=None)
    output.add_argument("-o", dest="output_files", choices=['bind'],
                        help="Select from following options:"
                             "(1) Print binding windows in fasta file",
                        default="bind")
    output.add_argument("--peaks", dest="print_peaks", action="store_true",
                        help="print peaks on plots. Default: False", default=False)
    output.add_argument("--valleys", dest="print_valleys", action="store_true",
                        help="print valleys on plots. Default: False", default=False)

    special = parser.add_argument_group('Special options for some -o choices')
    special.add_argument("--lookahead", dest="lookahead", type=int,
                         help="Set up lookahead parameter for pypeaks function. Default = 20",
                         default=20)
    special.add_argument("-w", "--window", dest="window", type=int,
                         help="Set up size of window for bind calculation (-o bind). Default: 10",
                         default=10)
    special.add_argument("-e", dest="experiment_to_use", type=str,
                         help="For -o bind, which experiment to use.")

    options = parser.parse_args()

    input_file = options.input_file

    # Output-file naming: optional user prefix, plus a 'normalized_' marker
    # when working on reads-per-million data.
    prefix = options.out_prefix + '_' if options.out_prefix else ''
    if options.normalized:  # was `== True`: compare truthiness, not identity to True
        prefix = 'normalized_' + prefix

    data = mRNAFromConcat(gtf_file=gtm.getGTF(options.gtf_file),
                          five_prime_flank=options.five_prime_flank,
                          three_prime_flank=options.three_prime_flank,
                          hits_threshold=options.hits_threshold,
                          lookahead=options.lookahead,
                          prefix=prefix,
                          npM=options.normalized)

    if options.output_files == "bind":
        # Read the concat csv (deletion counts), then compute binding windows.
        data.read_csv(input_file, use='deletions')
        data.bind(exp_to_use=options.experiment_to_use, window=options.window)
    print('# Done.')
def getGeneNamesFromGTF():
    """Extract unique gene names of a given biotype from a GTF file.

    Command-line tool: reads the GTF file resolved by ``gtm.getGTF`` from
    ``-f``, keeps lines whose second tab-separated column matches the
    requested biotype (``-g``, default 'tRNA'; in this pipeline's GTF files
    that column carries the biotype -- TODO confirm against the data),
    optionally filters by intron content (``-i``), and writes the unique
    gene names, one per line in first-seen order, to ``-o`` or stdout.

    Side effects: reads ``sys.argv``; reads the GTF file; writes to stdout
    or the ``-o`` file. Returns ``None``.
    """
    parser = OptionParser(usage="getGenesNames; type usage: %prog [options] -f filename")
    files = OptionGroup(parser, "File input options")
    files.add_option("-f", "--input_file", dest="gtf_file",
                     help="Provide the path to your gtf data file. Default is standard input.",
                     type="str", default=None)
    files.add_option("-g", "--genes", dest="genes",
                     help="Which biotype of features to get: mRNA, tRNA, rRNA, snRNA, snoRNA",
                     type="str", default='tRNA')
    # NOTE: the original statement ended in a stray trailing comma, building a
    # useless one-element tuple on every call; removed.
    files.add_option("-i", "--introns", dest="introns",
                     help="Introns? both - not discriminate; int_cont -only intron containing; int_less - only int less",
                     choices=["both", "int_cont", "int_less"], default="both")
    files.add_option("-o", "--output_file", dest="output_file",
                     help="Use this flag to provide an output file name. Default is standard output.",
                     default=None)
    parser.add_option_group(files)
    (options, args) = parser.parse_args()

    ### By default, input and output are expected from the standard input or standard output.
    # Restore default SIGPIPE handling so piping into e.g. `head` exits quietly.
    signal(SIGPIPE, SIG_DFL)
    outfile = sys.stdout
    if options.output_file:
        outfile = open(options.output_file, "w")

    gtf = GTF2.Parse_GTF()
    gtf_path = gtm.getGTF(options.gtf_file)  # resolve once; was resolved twice
    gtf.read_GTF(gtf_path)

    names_list = []           # unique names in first-seen order
    seen = set()              # O(1) membership test (was O(n) list scan per line)
    gene_name_re = re.compile(r'gene_name\s"(.*?)"')  # raw string: no escape warnings

    ### for loop extracting tRNA names
    with open(gtf_path, "r") as gtf_handle:
        for line in gtf_handle:
            if line.startswith('#'):
                continue
            line_elements = line.strip().split('\t')
            if line_elements[1] != options.genes:
                continue
            match = gene_name_re.search(line_elements[8])
            if match is None:
                # No gene_name attribute on this line: skip it. The original
                # bare `except: pass` silently reused the previous line's
                # name (or raised NameError on the first line).
                continue
            name = match.group(1)
            if name in seen:
                continue
            # Intron filter: keep only intron-containing / intron-less genes
            # when requested; "both" accepts everything.
            if options.introns == "int_cont" and not gtf.intronCoordinates(name):
                continue
            if options.introns == "int_less" and gtf.intronCoordinates(name):
                continue
            seen.add(name)
            names_list.append(name)

    outfile.write('\n'.join(names_list) + '\n')
    # Only close a handle we opened; closing sys.stdout breaks later output.
    if outfile is not sys.stdout:
        outfile.close()