def test_run(self):
    """Normalized-cut region finding on a mixed-composition sequence yields three regions."""
    gs = gelscore.Sequence('RGGGNGRGGRGRGGPMGRGGYGGGGSGGGGRGGFPSGGGGGGGQQRAGDWKCPNPTCENMNFSWRNECNQCKAPKPDGPGGGPGGSHMGGN')
    dist = gelscore.SimilarityWeight(sim_add=1.0)
    rf = gelscore.NormalizedCutRegionFinder(dist, min_region_size=6, score_threshold=0.1)
    res = rf.find(gs)
    # printRegions(res)  # uncomment to inspect the regions found
    # assertEqual reports the actual region count when the check fails.
    self.assertEqual(len(res), 3)
def test_run(self):
    """Scoring the SimilarityWeight matrix of an all-G sequence gives 1.0."""
    gs = gelscore.Sequence('GGGGGGGGGGGGGGGGGGGGGGGGGGGGGG')
    # Alternative weighting kept for reference:
    # dist = gelscore.SimilarityDistanceWeight(sim_add=1.0, sd=10.0)
    sim = gelscore.SimilarityWeight(sim_add=1.0)
    W = sim.weights(gs)
    score = sim.score(W)
    self.assertAlmostEqual(score, 1.0)
def test_run(self):
    """Region finding on a long protein sequence yields at least seven regions."""
    gs = gelscore.Sequence('MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTDTSGYGQSSYSSYGQSQNTGYGTQSTPQGYGSTGGYGSSQSSQSSYGQQSSYPGYGQQPAPSSTSGSYGSSSQSSSYGQPQSGSYSQQPSYGGQQQSYGQQQSYNPPQGYGQQNQYNSSSGGGGGGGGGGNYGQDQSSMSSGGGSGGGYGNQDQSGGGGSGGYGQQDRGGRGRGGSGGGGGGGGGGYNRSSGGYEPRGRGGGRGGRGGMGGSDRGGFNKFGGPRDQGSRHDSEQDNSDNNTIFVQGLGENVTIESVADYFKQIGIIKTNKKTGQPMINLYTDRETGKLKGEATVSFDDPPSAKAAIDWFDGKEFSGNPIKVSFATRRADFNRGGGNGRGGRGRGGPMGRGGYGGGGSGGGGRGGFPSGGGGGGGQQRAGDWKCPNPTCENMNFSWRNECNQCKAPKPDGPGGGPGGSHMGGNYGDDRRGGRGGYDRGGYRGRGGDRGGFRGGRGGGDRGGFGPGKMDSRGEHRQDRRERPY')
    # Smaller alternative input kept for reference:
    # gs = gelscore.Sequence('AAAAAAAAAAAAAYYGSGSGSGSGSAAAAAAAAA')
    dist = gelscore.SimilarityWeight(sim_add=1.0)
    rf = gelscore.NormalizedCutRegionFinder(dist, min_region_size=10, score_threshold=0.025)
    res = rf.find(gs)
    # printRegions(res)  # uncomment to inspect the regions found
    # assertGreaterEqual reports the actual region count when the check fails.
    self.assertGreaterEqual(len(res), 7)
def test_run(self):
    """A two-block sequence splits into exactly two regions, below the max_regions cap."""
    gs = gelscore.Sequence('AAAAAAAAAAAAGGGGGGGGGGG')
    dist = gelscore.SimilarityWeight(sim_add=1.0)
    max_regions = 4
    rf = gelscore.NormalizedCutRegionFinder(
        dist,
        min_region_size=6,
        score_threshold=0.9,
        max_regions=max_regions)
    res = rf.find(gs)
    # Dedicated assertions report the actual region count on failure.
    self.assertLess(len(res), max_regions)
    self.assertEqual(len(res), 2)
def test_run(self):
    """A G / mixed / G sequence is expected to split into three regions.

    DAD: this is not a passing test right now, doesn't split into 3
    """
    gs = gelscore.Sequence('GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGACDEFGHIKLMNPQRSTVWYGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG')
    dist = gelscore.SimilarityWeight(sim_add=1.0)
    max_regions = 16
    rf = gelscore.NormalizedCutRegionFinder(
        dist,
        min_region_size=6,
        score_threshold=0.01,
        max_regions=max_regions)
    res = rf.find(gs)
    # Left active on purpose: prints the regions while the test is failing.
    printRegions(res)
    # assertEqual reports the actual region count when the check fails.
    self.assertEqual(len(res), 3)
def test_run(self):
    """An all-G sequence is reported as a single region."""
    gs = gelscore.Sequence('GGGGGGGGGGGGGGGGGGGGGGGGGGGGGG')
    # Alternative weighting kept for reference:
    # dist = gelscore.SimilarityDistanceWeight(sim_add=1.0, sd=10.0)
    dist = gelscore.SimilarityWeight(sim_add=1.0)
    max_regions = 16
    rf = gelscore.NormalizedCutRegionFinder(
        dist,
        min_region_size=6,
        score_threshold=0.3,
        max_regions=max_regions)
    res = rf.find(gs)
    # Debugging aid:
    # for r in res:
    #     print r.region.start, r.region.end, r.score, r.norm_score, r.region
    # assertEqual reports the actual region count when the check fails.
    self.assertEqual(len(res), 1)
def test_run(self):
    """A G / A / P three-block sequence splits into three regions."""
    gs = gelscore.Sequence('GGGGGGGGGGAAAAAAAAAAPPPPPPPPPP')
    sim = gelscore.SimilarityWeight(sim_add=1.0)
    rf = gelscore.NormalizedCutRegionFinder(sim, min_region_size=5, score_threshold=0.8)
    # Debugging aid for inspecting the weight matrix:
    # D = sim.weights(gs)
    # print D
    # print D.sum()
    # print D[1,:].sum()
    # print D[:,1].sum()
    res = rf.find(gs)
    # Debugging aid for inspecting the regions:
    # for r in res:
    #     print r.score, r.norm_score, r.region
    # assertEqual reports the actual region count when the check fails.
    self.assertEqual(len(res), 3)
# Read input: one record per line of the delimited file, keyed by its 'id' field.
if not os.path.isfile(options.in_fname):
    raise IOError("# Error: file {} does not exist".format(
        options.in_fname))
data = {}
# open() replaces the Python-2-only file() builtin; the with-block closes the
# handle even if the reader raises part-way through.
with open(options.in_fname, 'r') as inf:
    dlr = util.DelimitedLineReader(inf, header=True)
    headers = dlr.headers
    for flds in dlr.dictentries:
        data[flds['id']] = flds

# Distance measure
dist = gelscore.SimilarityWeight(sim_add=1.0, max_distance=200)

# Arrange the data, visiting records in sorted-id order.
keys = sorted(data)
for k in keys:
    dat = data[k]
    seq = dat['sequence']
    # NOTE(review): `sim` is not used in the visible part of this script;
    # presumably consumed by code outside this excerpt -- confirm.
    sim = selfSimilarity(seq, dist)

# Write output
n_written = 0
# Tack sequence on the end, for better readability
columns = headers[:]
columns.remove('sequence')
data_outs.write('\t'.join(columns) + "\tp.value\tp.value.adj\tsequence\n")
# Reverse lookup of gene_orf_dict: maps each value back to its key.
orf_gene_dict = dict((orf, gene) for (gene, orf) in gene_orf_dict.items())

# Select which genes to process
query_keys = []
# Truthiness handles both an unset option (None) and an empty list. The
# previous `not ... is []` identity test was always true, so these branches
# ran unconditionally.
if options.query_orf:
    # Specific ORF(s)
    query_keys += options.query_orf
if options.query_gene:
    # Specific gene(s), translated through gene_orf_dict
    query_keys += [gene_orf_dict[gene] for gene in options.query_gene]
if not query_keys:
    # Go through all proteins in database
    query_keys = all_keys

# Distance definition
# NOTE(review): the keyword here is `max_dist`, while another call site in
# this source passes `max_distance=` to SimilarityWeight -- confirm which
# name the constructor actually accepts.
dist = gelscore.SimilarityWeight(sim_add=1.0, max_dist=options.max_distance)

# Region finder
reg_finder = gelscore.NormalizedCutRegionFinder(
    dist,
    min_region_size=options.min_region_size,
    score_threshold=options.score_threshold,
    max_regions=options.max_regions)

# Remove gaps?
if options.degap:
    for k in query_keys:
        prot_dict[k] = prot_dict[k].replace("-", '')

if options.debugging:
    # Limit a debugging run to the first 100 queries. Slicing clamps to the
    # list length, so no explicit min() is needed (the old call passed two
    # arguments to len() and raised TypeError).
    query_keys = query_keys[:100]