def getGenesInLongRangeWindows(annotations,regions_collection,shift):
    """Collect, for every gene, the regions that fall in a window around it.

    The window is centred on ``annotations[geneID][1][0]`` and reaches
    ``shift`` positions to either side.  Regions are looked up in
    ``regions_collection[chromosome]``, where the chromosome is
    ``annotations[geneID][0]``.

    Region items are indexed positionally: index 1 is the sort key and
    index 2 is compared against the left window border (presumably the
    region start and end -- confirm against the code building the
    collections).

    Returns a dict mapping every gene ID in ``annotations`` to a
    SortedCollection (keyed on item index 1) holding the selected regions.
    Genes on a chromosome without regions get an empty collection.
    """
    gene_regions = {}
    for geneID, annotation in annotations.items():
        window_hits = SortedCollection(key=itemgetter(1))
        gene_regions[geneID] = window_hits

        anchor = annotation[1][0]
        window_start = anchor - shift
        window_end = anchor + shift
        chromosome = annotation[0]
        if chromosome not in regions_collection:
            continue
        loops = regions_collection[chromosome]

        # Leftmost candidate: the closest region starting before the window,
        # provided it still reaches into the window; otherwise the first
        # region starting at or after the window's left border.
        try:
            first = loops.find_lt(window_start)
        except ValueError:
            first = None
        if first is None or first[2] < window_start:
            try:
                first = loops.find_ge(window_start)
            except ValueError:
                first = None

        # Rightmost candidate: the last region starting at or before the
        # window's right border.
        try:
            last = loops.find_le(window_end)
        except ValueError:
            last = None

        if first is None or last is None:
            continue

        # An inverted interval yields an empty range, so nothing is copied.
        first_index = loops.index(first)
        last_index = loops.index(last)
        for position in range(first_index, last_index + 1):
            window_hits.insert_right(loops[position])
    return gene_regions
def read_Intra_Loop_Regions_Collection(filename):
    """Read intra-chromosomal loops from a whitespace-separated file.

    Each line is split on whitespace.  A line is used only when column 0
    equals column 3 (both anchors on the same chromosome); such lines are
    numbered consecutively starting at 1.  Columns 1, 2, 4 and 5 are
    parsed as integers.

    Returns a pair ``(left_region_collection, right_region_collection)``.
    Both are dicts keyed by chromosome name with every "chr" substring
    removed, whose values are SortedCollections keyed on item index 1:

    * left items:  ``(loopID, col1, col2, col4, col5)``
    * right items: ``(loopID, col4, col5, col1, col2)``

    Blank lines are skipped.  Any other malformed line still raises
    (IndexError / ValueError) so that damaged input is not silently hidden.
    """
    left_region_collection = {}
    right_region_collection = {}
    loopID = 0
    with open(filename) as hi_c_file:
        for line in hi_c_file:
            fields = line.split()
            # A blank line (e.g. a trailing newline at the end of the file)
            # has no fields; indexing it would raise IndexError.
            if not fields:
                continue
            # Only loops whose two anchors share a chromosome are kept.
            if fields[0] != fields[3]:
                continue
            loopID += 1
            chromosome = fields[0].replace("chr", "")
            # Both dicts are always filled together, so one check suffices.
            if chromosome not in left_region_collection:
                left_region_collection[chromosome] = SortedCollection(key=itemgetter(1))
                right_region_collection[chromosome] = SortedCollection(key=itemgetter(1))
            left_start, left_end = int(fields[1]), int(fields[2])
            right_start, right_end = int(fields[4]), int(fields[5])
            left_region_collection[chromosome].insert_right(
                (loopID, left_start, left_end, right_start, right_end))
            right_region_collection[chromosome].insert_right(
                (loopID, right_start, right_end, left_start, left_end))
    return left_region_collection, right_region_collection
def merge_gene_regions(gene_regions, add_gene_regions,annotation):
    """Merge the regions of ``add_gene_regions`` into ``gene_regions``.

    For every gene ID in ``annotation``:

    * a missing entry in ``gene_regions`` is created as an empty
      SortedCollection keyed on item index 1;
    * every region listed for that gene in ``add_gene_regions`` is added
      unless ``gene_regions[geneID]`` already holds an item with the same
      key (``region[1]``).

    ``gene_regions`` is modified in place and also returned.
    """
    for geneID in annotation:
        # Direct dict membership: ``in d.keys()`` builds a list on Python 2
        # and makes every test linear in the number of genes.
        if geneID not in gene_regions:
            gene_regions[geneID] = SortedCollection(key=itemgetter(1))
        if geneID not in add_gene_regions:
            continue
        merged = gene_regions[geneID]
        for region in add_gene_regions[geneID]:
            try:
                merged.find(region[1])
            except ValueError:
                # No item with this key yet, so the region is new.
                merged.insert_right(region)
    return gene_regions
def __init__(self, capacity, start_key=lambda o: o[0], length_key=lambda o: o[1]):
    """
    Create an empty cache.

    @param capacity: Handed to the BoundedLRUQueue as its first argument.
    @param start_key: A function that fetches the range start from an item.
        It is used as the key of both the sorted range collection and the
        LRU queue.
    @param length_key: Stored for later use; presumably fetches the range
        length from an item (confirm against the methods that use it).
    """
    super(RangeCache, self).__init__()
    self._start_key = start_key
    self._length_key = length_key
    self._ranges = SortedCollection(key=start_key)
    self._lru = BoundedLRUQueue(capacity, key=start_key)
def get_stats(self):
    """Print the ``report.txt`` template filled with read/unread statistics.

    Items come from ``self.get_items(False)``.  They are split by their
    ``is_read`` / ``is_unread`` attributes; read items are sorted by
    ``time_read`` and unread items by ``time_added``.  The 7-day and
    30-day figures are whatever ``self._get_items_since`` returns for the
    respective sorted collection and cut-off date.
    """
    items = self.get_items(False)
    read = [item for item in items if item.is_read]
    unread = [item for item in items if item.is_unread]
    by_time_read = SortedCollection(read, operator.attrgetter('time_read'))
    by_time_added = SortedCollection(unread, operator.attrgetter('time_added'))

    # Cut-off dates for the "last week" and "last month" sections.
    now = datetime.datetime.now()
    week_ago = now + relativedelta(days=-7)
    month_ago = now + relativedelta(days=-30)

    report = self.render(
        "report.txt",
        total=len(items),
        total_read=len(read),
        total_unread=len(unread),
        now=now,
        newly_added_7d=self._get_items_since(by_time_added, week_ago),
        newly_read_7d=self._get_items_since(by_time_read, week_ago),
        newly_added_30d=self._get_items_since(by_time_added, month_ago),
        newly_read_30d=self._get_items_since(by_time_read, month_ago))
    print(report)
def ordered_traversal(dependency_list):
    """Return the values of ``dependency_list`` in dependency order.

    A reverse-dependency mapping is built from the input.  The values that
    ``find_no_dependencies`` reports for it seed a SortedCollection
    (constructed with ``reverse=True``).  Values are then popped one at a
    time, appended to the result, removed from the mapping and discarded
    from every remaining dependency set; any value that becomes free is
    offered to the collection with ``maybe_insert``.

    The loop stops when the collection is empty, so values that never
    become free are simply absent from the result.
    """
    forward = build_forward_dependencies(dependency_list)
    pending = build_reverse_dependencies(forward)
    ready_queue = SortedCollection(find_no_dependencies(pending), reverse=True)

    ordered = []
    while ready_queue:
        node = ready_queue.pop()
        del pending[node]
        ordered.append(node)
        # The emitted value no longer blocks anything else.
        for blockers in pending.values():
            blockers.discard(node)
        for candidate in find_no_dependencies(pending):
            ready_queue.maybe_insert(candidate)
    return ordered
def __init__(self, ranges=tuple()):
    """Build the range set from an iterable of ``(begin, end)`` pairs.

    Attributes set when at least one range is given:

    * ``begin`` / ``start``: start of the first stored range;
    * ``end`` / ``stop``: end of the last stored range;
    * ``span``: ``end - begin + 1``;
    * ``coverage``: sum of ``end - begin + 1`` over the stored ranges.

    With no ranges, the four border attributes are ``None`` and ``span``
    and ``coverage`` are 0.
    """
    # Sort by the start of every range:
    self._ranges = SortedCollection(ranges, itemgetter(0))
    # Test the built collection rather than the argument: an empty
    # iterator or generator is truthy, which used to send an empty
    # collection into _consolidate() and fail on its first element.
    if len(self._ranges) > 0:
        self._consolidate()
        self.begin = self.start = self._ranges[0][0]
        self.end = self.stop = self._ranges[-1][1]
        self.span = self.end - self.begin + 1
        self.coverage = sum(end - begin + 1 for (begin, end) in self._ranges)
    else:
        self.begin = self.start = self.end = self.stop = None
        self.span = self.coverage = 0
def readOC_Region(filename):
    """Read regions written as ``chrom:start-end`` from a text file.

    The first line of the file is discarded.  On every following line only
    the first whitespace-separated token is looked at; it is used when it
    contains a ":" and is otherwise ignored.  Used entries are numbered
    consecutively starting at 1.

    Returns a dict keyed by chromosome name with every "chr" substring
    removed; values are SortedCollections keyed on item index 1 that hold
    ``(counter, start, end)`` tuples with integer start and end.

    Blank lines are skipped.  A token with a ":" but a malformed
    ``start-end`` part still raises (IndexError / ValueError).
    """
    oC = {}
    counter = 1
    # The context manager closes the file even when parsing raises.
    with open(filename, "r") as tfpa:
        tfpa.readline()  # Discard the first line.
        for l in tfpa:
            fields = l.split()
            # A blank line has no first token to inspect.
            if not fields:
                continue
            ds = fields[0].split(":")
            if len(ds) < 2:
                continue
            chrom = ds[0].replace("chr", "")
            se = ds[1].split("-")
            if chrom not in oC:
                oC[chrom] = SortedCollection(key=itemgetter(1))
            oC[chrom].insert_right((counter, int(se[0]), int(se[1])))
            counter += 1
    return oC
def _consolidate(self):
    """Replace ``self._ranges`` with a copy in which touching ranges are merged.

    Ranges are walked in stored order.  A range is folded into the running
    one when it starts no later than one past the running end (so both
    overlapping and directly adjacent ranges are merged).

    ``self._ranges`` must hold at least one range.
    """
    merged = SortedCollection(key=itemgetter(0))
    run_begin, run_end = self._ranges[0]
    for begin, end in self._ranges[1:]:
        if run_end >= begin - 1:
            # Overlapping or adjacent: extend the running range.
            run_end = max(run_end, end)
            continue
        # There is a gap: emit the running range and start a new one.
        merged.insert((run_begin, run_end))
        run_begin, run_end = begin, end
    merged.insert((run_begin, run_end))
    self._ranges = merged
def get_intersecting_regions(a_regions, b_collection):
    """Return the regions of ``a_regions`` that intersect ``b_collection``.

    An ``a_region`` spans ``a_region[1]`` to ``a_region[2]``.  Items of
    ``b_collection`` are tested twice: once through their interval at
    indices (1, 2) and once through their interval at indices (3, 4).  A
    region that intersects through both intervals is inserted twice, as in
    the original implementation.

    ``b_collection`` is expected to arrive keyed on item index 1.  Its key
    is switched to index 3 for the second pass and is always switched back
    to index 1, even when that pass raises.

    ``a_regions`` is iterated twice, so it must be re-iterable (a
    one-shot iterator would make the second pass see nothing).

    Returns a new SortedCollection keyed on item index 1.
    """
    intersection_a = SortedCollection(key=itemgetter(1))

    for a_region in a_regions:
        if _region_intersects(a_region, b_collection, 1, 2):
            intersection_a.insert_right(a_region)

    # Second pass: search on the other interval of the b items.
    b_collection.key = itemgetter(3)
    try:
        for a_region in a_regions:
            if _region_intersects(a_region, b_collection, 3, 4):
                intersection_a.insert_right(a_region)
    finally:
        # Never leave the caller's collection keyed on the wrong field.
        b_collection.key = itemgetter(1)
    return intersection_a


def _first_candidate_index(collection, query_start, end_field):
    """Index of the first item that may reach ``query_start``, or ``len(collection)``."""
    try:
        index = collection.find_lt_index(query_start)
    except ValueError:
        # Nothing starts before the query; take the first item at or after it.
        try:
            return collection.find_ge_index(query_start)
        except ValueError:
            return len(collection)
    # The closest item starting before the query ends too early: move on.
    if collection[index][end_field] < query_start:
        index += 1
    return index


def _region_intersects(a_region, collection, start_field, end_field):
    """Tell whether ``a_region`` intersects an item of ``collection`` on the given fields."""
    index = _first_candidate_index(collection, a_region[1], end_field)
    return index < len(collection) and collection[index][start_field] <= a_region[2]
def main():
    '''
    Command-line entry point.

    Builds the argument parser (global options plus the "region", "gene"
    and "config_file" sub-commands), loads the hit tracks for the selected
    organism, re-indexes every track by chromosome and hit position, and
    hands the parsed arguments and the hits to handle_args.

    With the "config_file" sub-command, every non-blank line of the file
    that does not start with "#" is parsed as its own command line with
    the same parser and passed to handle_args.  The hits are loaded only
    once, from the arguments of the top-level invocation.

    Raises Exception when no hit tracks were loaded.
    '''
    # Options shared by all sub-commands.
    parser = argparse.ArgumentParser(usage=MAINUSEAGE)
    parser.add_argument("--hits-folder", default=".")
    parser.add_argument("--output-folder", default=".")
    parser.add_argument("--domains", default="highlighted", choices=[GENES_ALL, GENES_HIGHLIGHTED, GENES_NONE])
    parser.add_argument("--direction", default="highlighted", choices=[GENES_ALL, GENES_HIGHLIGHTED, GENES_NONE])
    parser.add_argument("--organism", default="Calb", choices=["Calb", "Scer", "Spom"])
    parser.add_argument("--absolute-pixel-size", type=int, default=0)
    parser.add_argument("--rna-bam")
    # Turns a comma-separated option value into a list of strings.
    gene_list_parser = lambda gs: [g for g in gs.split(',')]
    # The chosen sub-command name is stored in args.source_type.
    subparsers = parser.add_subparsers(dest="source_type")
    # "region": an explicit chromosome interval.
    region_parser = subparsers.add_parser("region", usage=REGIONUSAGE)
    region_parser.add_argument("--chromosome", required=True)
    region_parser.add_argument("--start", type=int, required=True)
    region_parser.add_argument("--stop", type=int, required=True)
    region_parser.add_argument("--genes", type=gene_list_parser, default="")
    # "gene": one or more named genes.
    gene_parser = subparsers.add_parser("gene", usage=GENEUSAGE)
    gene_name = gene_parser.add_mutually_exclusive_group(required=True)
    # TODO: can we make --genes not be named, and instead come at the end of the parser?
    gene_name.add_argument("--genes", type=gene_list_parser)
    # --percent-of-length and --bps are mutually exclusive.
    gene_region = gene_parser.add_mutually_exclusive_group()
    gene_region.add_argument("--percent-of-length", type=float, default=0.2)
    gene_region.add_argument("--bps", type=int)
    gene_parser.add_argument("--exclude-genes", type=gene_list_parser, default="")
    # "config_file": a file holding one command line per line.
    config_file_parser = subparsers.add_parser("config_file", usage=CONFIGUSAGE)
    config_file_parser.add_argument("config_file")
    args = parser.parse_args()
    # Load the hit tracks; the source depends on the organism.
    if args.organism == "Calb":
        hits = SummaryTable.read_hit_files(glob.glob(os.path.join(args.hits_folder, "*_Hits.txt")))
    elif args.organism == "Scer":
        import cPickle
        all_track_files = glob.glob(os.path.join(args.hits_folder, "*.wig"))
        # We cache the hits because the hit reading process involves finding the
        # feature which got hit, for every hit, and that makes reading an O(n log n)
        # operation, which is a little slow. O(n) is better here.
        hit_cache = os.path.join(args.hits_folder, "cached_sc_track_hits.dat")
        if not os.path.exists(hit_cache):
            all_tracks = [SummaryTable.get_hits_from_wig(fname) for fname in all_track_files]
            with open(hit_cache, 'wb') as pickle_file:
                cPickle.dump(all_tracks, pickle_file)
        else:
            # NOTE(review): unpickling executes whatever the cache file
            # contains; the hits folder must be trusted.
            with open(hit_cache, 'rb') as pickle_file:
                all_tracks = cPickle.load(pickle_file)
        hits = all_tracks
    elif args.organism == "Spom":
        # NOTE(review): hard-coded absolute path; --hits-folder is ignored here.
        hits = [SummaryTable.read_pombe_hit_file("/Users/bermanlab/ngs-bench/Hermes/SRR327340.trimmed.trail_q_20.sorted_Hits.csv")]
    # Process hits for quicker access by chromosome name and position:
    new_hits = []
    db = _get_organism(args.organism).feature_db
    # TODO: we manipulate the chromosome names to reflect the standard names,
    # but this should be done in the hit-reading functions.
    chrom_names = db._chrom_names # TODO: don't do this, _chrom_names are protected!
    for hit_track in hits:
        # One collection per chromosome of the feature db, sorted by hit position.
        new_hit_track = {chrom.name: SortedCollection(key=lambda h: h["hit_pos"]) for chrom in db}
        for hit in hit_track:
            # Rewrite the hit's chromosome in place using the db's name mapping.
            chrom = chrom_names[hit["chrom"]]
            hit["chrom"] = chrom
            new_hit_track[chrom].insert(hit)
        new_hits.append(new_hit_track)
    hits = new_hits
    if not hits:
        raise Exception("No hit files were found in the hits folder: %s" % args.hits_folder)
    if args.source_type == "config_file":
        with open(args.config_file, "r") as in_file:
            # Read the file once, in case it changes in the middle of the run:
            for line in in_file.readlines():
                line = line.strip()
                # Skip blank lines and "#" comment lines.
                if not line or line.startswith("#"):
                    continue
                handle_args(parser.parse_args(shlex.split(line)), hits)
    else:
        handle_args(args, hits)
def addFile(filename, games):
    """Add the games of one CSV file to ``games``, skipping conflicting ones.

    ``filename`` is read with csv.DictReader.  The 'start' and 'end'
    columns of every row are replaced by the result of ``parser.parse``.
    A row is inserted into ``games`` (a SortedCollection keyed on 'start')
    unless it conflicts with a game that is already there; conflicts are
    reported on standard output.
    """
    with open(filename, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            row['start'] = parser.parse(row['start'])
            row['end'] = parser.parse(row['end'])
            item = _find_conflict(games, row)
            if item is None:
                games.insert(row)
            else:
                print("Skipping game: %s,%s,%s it conflicts with %s,%s,%s" % (row['name'], row['start'], row['end'], item['name'], item['start'], item['end']))


def _find_conflict(games, row):
    """Return the game in ``games`` that conflicts with ``row``, or None.

    Two games are compatible only when one ends strictly before the other
    starts.  Both neighbours of ``row`` are checked: the last game starting
    at or before it, and the first game starting after it.  Checking only
    the earlier neighbour would accept a game that starts before an
    existing one but runs into it.
    """
    try:
        previous = games.find_le(row['start'])
    except ValueError:
        previous = None
    if previous is not None and previous['end'] >= row['start']:
        return previous
    try:
        following = games.find_gt(row['start'])
    except ValueError:
        return None
    if following['start'] <= row['end']:
        return following
    return None


if __name__ == "__main__":
    arguments = docopt(__doc__, version='Merge 0.1')
    # Games from all input files, kept sorted by start time.
    mergedGames = SortedCollection([], key=lambda k: k['start'])
    for filename in arguments["<CSVfiles>"]:
        addFile(filename, mergedGames)
    with open(arguments["<outCSV>"],"w") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["team","name", "start", "end", "band", "freq"])
        writer.writeheader()
        for game in mergedGames:
            writer.writerow(game)
def convertTrainingData():
    """Split all views into train / test / CV sets and write nine CSV files.

    Articles (module-level ``articles``) are shuffled; the first 5% become
    the CV set, the next 10% the test set and the rest the training set.
    A sub-article belongs to the set of its parent article, and a comment
    to the set of the article, sub-article or comment it was made on
    (anything whose parent is not found lands in the training set).

    Every view for which ``prepTrainingSet`` returns a non-empty ``x`` is
    turned into one row (``y`` followed by ``x``) and appended to the file
    matching the type and set of the viewed item.  Files are named
    ``<article|subarticle|comment><Train|Test|CV>.csv`` and are written to
    the current directory, each starting with a header row.
    """
    # Columns shared by every output file.
    baseHeader = [
        'yUp', 'yDown', 'yGetCom', 'yCreateCom', 'yGetSub', 'yCreateSub',
        'id', 'rating', 'viewCount', 'upVoteCount', 'downVoteCount',
        'getCommentsCount', 'createComment', 'notCommentRating',
        'PgetComment', 'PcreateComment', 'Pup', 'Pdown',
    ]
    # Article files carry five extra sub-article columns at the end.
    articleHeader = baseHeader + [
        'getSubarticlesCount', 'createSubarticle', 'notSubarticleRating',
        'PgetSub', 'PcreateSub',
    ]
    data = {}
    for kind, header in (('article', articleHeader),
                         ('subarticle', baseHeader),
                         ('comment', baseHeader)):
        for split in ('Train', 'Test', 'CV'):
            # Every file gets its own copy of the header row.
            data[kind + split + '.csv'] = [list(header)]

    arts = articles[:]
    random.shuffle(arts)
    cvLength = int(0.05 * len(arts))
    testLength = int(0.10 * len(arts))
    cvArts = SortedCollection(arts[0:cvLength], key=itemgetter('id'))
    testArts = SortedCollection(arts[cvLength:(cvLength + testLength)], key=itemgetter('id'))
    trainArts = SortedCollection(arts[(cvLength + testLength):], key=itemgetter('id'))
    cvSubs = SortedCollection([], key=getId)
    testSubs = SortedCollection([], key=getId)
    trainSubs = SortedCollection([], key=getId)
    cvComs = SortedCollection([], key=getId)
    testComs = SortedCollection([], key=getId)
    trainComs = SortedCollection([], key=getId)
    comsBySplit = {'CV': cvComs, 'Test': testComs, 'Train': trainComs}

    def contains(lst, Id):
        # True when lst holds a (truthy) item whose key is str(Id).
        try:
            return bool(lst.find(str(Id)))
        except ValueError:
            return False

    def splitOf(cvSet, testSet, Id):
        # Name of the set an item with parent Id belongs to.
        if contains(cvSet, Id):
            return 'CV'
        if contains(testSet, Id):
            return 'Test'
        return 'Train'

    for sub in Subarticles.find():
        if contains(testArts, sub['parentId']):
            testSubs.insert(sub)
        elif contains(cvArts, sub['parentId']):
            cvSubs.insert(sub)
        else:
            trainSubs.insert(sub)

    # First pass: comments made directly on articles or sub-articles.
    # Everything else is kept back until those comments are placed.
    replies = []
    for comment in Comments.find():
        parentType = comment['commentableType']
        if parentType == 'article':
            comsBySplit[splitOf(cvArts, testArts, comment['commentableId'])].insert(comment)
        elif parentType == 'subarticle':
            comsBySplit[splitOf(cvSubs, testSubs, comment['commentableId'])].insert(comment)
        else:
            replies.append(comment)

    # Second pass: replies follow the comment they answer.  (Entries in
    # `replies` are never of type 'article' or 'subarticle', so only the
    # 'comment' case needs handling here.)
    for comment in replies:
        if comment['commentableType'] == 'comment':
            comsBySplit[splitOf(cvComs, testComs, comment['commentableId'])].insert(comment)
        else:
            print("Comment on a comment!")

    print("cvArts: {}\tcvSubs: {}\tcvComs: {}".format(
        len(cvArts), len(cvSubs), len(cvComs)))
    print("testArts: {}\ttestSubs: {}\ttestComs: {}".format(
        len(testArts), len(testSubs), len(testComs)))
    print("trainArts: {}\ttrainSubs: {}\ttrainComs: {}".format(
        len(trainArts), len(trainSubs), len(trainComs)))
    sys.stdout.flush()

    views = Views.find().sort("_id", 1)
    viewLength = views.count()
    pbar = progressbar.ProgressBar(widgets=[
        progressbar.Timer(), progressbar.ETA(), progressbar.Bar(),
        progressbar.Percentage()
    ], maxval=viewLength).start()

    # CV and test sets to consult for each kind of viewed item.
    setsByType = {
        'article': (cvArts, testArts),
        'subarticle': (cvSubs, testSubs),
        'comment': (cvComs, testComs),
    }
    for processed, view in enumerate(views):
        pbar.update(processed)
        x, y = prepTrainingSet(view)
        if not len(x):
            continue
        dat = np.append(y, x).tolist()
        viewableType = view['viewableType']
        if viewableType in ('article', 'subarticle', 'comment'):
            cvSet, testSet = setsByType[viewableType]
            split = splitOf(cvSet, testSet, view['viewableId'])
            data[viewableType + split + '.csv'].append(dat)
        else:
            # The placeholder used to be printed literally.
            print("Unknown viewableType: {}".format(viewableType))
    pbar.finish()

    print("Writing results")
    for filename, lst in data.items():
        with open(filename, 'wb') as csvfile:
            csv.writer(csvfile).writerows(lst)
DownVotes = InteractionsDB.downVote Articles = ArticlesDB.article Subarticles = ArticlesDB.subarticle Comments = ArticlesDB.comment def getId(item): return str(item['_id']) def getViewId(item): return str(item['viewId']) print "caching downvotes" downVotes = SortedCollection(list(DownVotes.find().sort("viewId", 1)), key=getViewId) #downVotes = SortedCollection(list(DownVotes.find()), key=itemgetter('viewId')) print "caching upvotes" upVotes = SortedCollection(list(UpVotes.find().sort("viewId", 1)), key=getViewId) #upVotes = SortedCollection(list(UpVotes.find()), key=itemgetter('viewId')) #upVotes = SortedCollection([], key=itemgetter('viewId')) print "caching clicks" clicks = SortedCollection(list(Clicks.find().sort("viewId", 1)), key=getViewId) #clicks = SortedCollection(list(Clicks.find()), key=itemgetter('viewId')) #clicks = SortedCollection([], key=itemgetter('viewId')) def findGeneric(lst, Id): Id = str(Id) try: