def merge_videos():
    global mer
    path_name_seg = []
    for i in all_filename:
        path_name_seg.append(
            os.path.join(target_path, sel_res.getVideoTitle(), i))
    mer = Merger(
        unicode(os.path.join(
            target_path,
            sel_res.getVideoTitle() + '.' + sel_res.getFileFormat())),
        path_name_seg)
    gui.frame_main.initTotal_Merge(len(all_filename))
    mer.start()
    while True:
        gui.frame_main.updateMerge(mer.now)
        time.sleep(0.05)
        if mer.now == mer.sum:
            gui.frame_main.updateMerge(mer.now)
            break
    with open('config.ini', 'wb') as f:
        save_configure()
    dlg = wx.MessageDialog(gui.frame_main,
                           u'Video merge finished. Delete the segment files?',
                           u'Notice', wx.YES_NO | wx.ICON_QUESTION)
    if dlg.ShowModal() == wx.ID_YES:
        del_seg_video()
        dlg = wx.MessageDialog(gui.frame_main, u'Segment files deleted.',
                               u'Notice', wx.OK | wx.ICON_QUESTION)
        dlg.ShowModal()
def process_multiple(log, do_fetch=True, do_parse=True, do_merge=True):
    root = config["data-dir"]
    if do_fetch:
        tokens = Tokens()
        api = API(tokens, log)
        util.delete_files(root + '/processing/invoices', '*.json')
        success, invoice_cnt = api.fetch_invoice_details(hours_delta=30, tz_offset=7)
        if success and invoice_cnt > 0:
            log.write("INFO api invoices extraction succeeded {:,} invoices saved to : {}"
                      .format(invoice_cnt, '/processing/invoices'))
        elif success and invoice_cnt == 0:
            log.write("INFO api no invoices extracted (no new/updated invoices in refresh period)")
            return True
        else:
            log.write("ERROR api invoices extraction failed {:,} invoices saved to : {}"
                      .format(invoice_cnt, '/processing/invoices'))
            return False
    if do_parse:
        util.delete_files(root + '/processing/invoices', '*.csv')
        parser = Parser(log)
        parser.parse('invoices-line-items')
    if do_merge:
        merger = Merger(log)
        merger.merge_invoice_delta()
    return True
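# A minimal driver for the pipeline above (sketch only; the real `config`, Tokens, API,
# Parser and Merger objects come from the surrounding project, and any object with a
# write() method works as `log` -- a plain file handle is used here as an assumption):
if __name__ == "__main__":
    with open("process_multiple.log", "a") as log:
        ok = process_multiple(log, do_fetch=True, do_parse=True, do_merge=True)
    print("pipeline completed" if ok else "pipeline failed")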
def test(test_case):
    merger = Merger(test_case)
    # codes = merger.get_codes(3)
    for k in range(10, 9, -1):
        value_sum = 0
        merged = merger.merge_result(k)
        for code, num in merged.items():
            value_sum += num
        print 'k =', k, merged, ' \tMerged codes:', len(merged), ' \tTotal value:', value_sum
def __init__(self, *args, **kwargs):
    Merger.__init__(self, *args, **kwargs)
    self.learner_hash = kwargs.get('learner_hash', '')
    self.c_range = kwargs.get('c_range', [0.01, 0.7])
    self.get_frontier = Frontier(self.c_range, 0.)
    self.CACHENAME = './dbwipes.rangemerger.cache'
    self.i = 0
    self.yrange = None
def __init__(self, **kwargs):
    self.full_table = None
    self.bad_tables = []
    self.good_tables = []
    self.bad_err_funcs = []
    self.good_err_funcs = []
    self.err_func = None
    self.cols = None
    Merger.__init__(self, **kwargs)
def test_merge_simple(self):
    m = Merger()
    l = Layer("1")
    l.addChannel(1, 255)
    l.addChannel(2, 127)
    m.addLayer(l)
    m.merge()
    self.assertEqual(m.galaxy[1], 255)
    self.assertEqual(m.galaxy[2], 127)
def setup_stats(self, clusters):
    """
    Computes error bounds and the minimum volume of a 0-volume cluster.
    Adds data structures to the cluster objects.
    """
    Merger.setup_stats(self, clusters)
    for c in clusters:
        c.inf_func = c.create_inf_func(self.learner.l)
        c.c_range = list(self.c_range)
        c.inf_range = [c.inf_func(c.c_range[0]), c.inf_func(c.c_range[1])]
def setup_stats(self, clusters):
    """
    Computes error bounds and the minimum volume of a 0-volume cluster.
    Adds data structures to the cluster objects.
    """
    Merger.setup_stats(self, clusters)
    for c in clusters:
        c.inf_func = self.learner.create_inf_func(c)
        c.c_range = list(self.c_range)
        c.inf_range = [c.inf_func(c.c_range[0]), c.inf_func(c.c_range[1])]
def merge(self, corpus_size):
    """
    Merge all the data in the posting files using the BSBI algorithm.
    """
    docs_file = self.get_docs_file()
    for key in self.postings_data:
        if os.listdir(self.postings_data[key]['path']):  # directory is not empty
            merger = Merger(self.postings_data[key]['path'], "pkl", docs_file, corpus_size)
            merger.merge(self.postings_data[key]['name'])
    # The merger updates the docs data. After the merge of all the letters, all of the
    # document data is up to date and is saved to disk to reduce the memory load.
    utils.save_obj(docs_file, f"{self.posting_dir_path}\\docs\\docs_index")
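# The Merger class used above is not shown here. The BSBI merge step it performs is
# essentially a k-way merge of sorted posting runs into one posting list per term.
# A minimal, self-contained sketch of that idea (the `runs` format below is a
# hypothetical illustration, not this project's Merger API):
import heapq
from collections import defaultdict

def kway_merge_postings(runs):
    """runs: iterables of (term, doc_id, tf) tuples, each already sorted by term.
    Returns {term: [(doc_id, tf), ...]} merged across all runs."""
    merged = defaultdict(list)
    # heapq.merge lazily merges the already-sorted runs by term
    for term, doc_id, tf in heapq.merge(*runs, key=lambda entry: entry[0]):
        merged[term].append((doc_id, tf))
    return dict(merged)

# Example: two sorted runs produced by separate indexing passes
run_a = [("apple", 1, 2), ("cat", 1, 1)]
run_b = [("apple", 3, 1), ("dog", 2, 4)]
assert kway_merge_postings([run_a, run_b])["apple"] == [(1, 2), (3, 1)]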
def get_IMPA_Merger(name):
    imp = iMPA(name)
    terc = imp.terc
    data = imp.getAddresses()
    s = min(map(lambda x: x.center.y, data))
    w = min(map(lambda x: x.center.x, data))
    n = max(map(lambda x: x.center.y, data))
    e = max(map(lambda x: x.center.x, data))
    addr = getAddresses(map(str, (s, w, n, e)))
    m = Merger(data, addr, terc)
    m.post_func.append(m.merge_addresses)
    m.merge()
    return m
def get_impa_merger(name):
    imp = iMPA(name)
    terc = imp.terc
    data = imp.get_addresses()
    s = min(map(lambda x: x.center.y, data))
    w = min(map(lambda x: x.center.x, data))
    n = max(map(lambda x: x.center.y, data))
    e = max(map(lambda x: x.center.x, data))
    addr = get_addresses(map(str, (s, w, n, e)))
    m = Merger(data, addr, terc, "%s.e-mapa.net" % name)
    m.post_func.append(m.merge_addresses)
    m.merge()
    return m
def test_merge_complete(self):
    m = Merger()
    l1 = Layer("1")
    l1.addChannel(2, 1)
    l1.addChannel(3, 255)
    l1.addChannel(4, 127)
    l2 = Layer("2")
    l2.addChannel(3, 0, 0.5)
    l2.addChannel(4, 255, "max")
    l2.addChannel(5, 255, "min")
    l3 = Layer("3")
    l3.addChannel(2, 255, 0.3)
    l4 = Layer("4")
    l4.addChannel(2, 127, 0.6)
    m.addLayer(l1)
    m.addLayer(l2)
    m.addLayer(l3)
    m.addLayer(l4)
    m.merge()
    self.assertEqual(m.galaxy[1], 0)
    self.assertEqual(m.galaxy[2], 107)
    self.assertEqual(m.galaxy[3], 128)
    self.assertEqual(m.galaxy[4], 255)
    self.assertEqual(m.galaxy[5], 0)
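# The Layer/Merger implementation exercised by these tests is not included here.
# Below is a minimal sketch that would satisfy the assertions above, assuming channels
# are blended in layer order, a numeric third argument to addChannel is an opacity,
# "max"/"min" are blend modes, unset channels default to 0, and a layer added with an
# existing name replaces the old one. This is an illustration, not the original class.
class Layer(object):
    def __init__(self, name):
        self.name = name
        self.channels = {}  # channel -> (value, mode); mode is an opacity or "max"/"min"

    def addChannel(self, channel, value, mode=1.0):
        self.channels[channel] = (value, mode)


class Merger(object):
    def __init__(self):
        self.layers = []
        self.galaxy = {}

    def addLayer(self, layer):
        # a layer with the same name replaces the existing one
        self.delLayer(layer)
        self.layers.append(layer)

    def delLayer(self, layer):
        self.layers = [l for l in self.layers if l.name != layer.name]

    def merge(self):
        used = {c for layer in self.layers for c in layer.channels}
        self.galaxy = {c: 0 for c in range(1, max(used, default=0) + 1)}
        for layer in self.layers:
            for c, (value, mode) in layer.channels.items():
                current = self.galaxy.get(c, 0)
                if mode == "max":
                    self.galaxy[c] = max(current, value)
                elif mode == "min":
                    self.galaxy[c] = min(current, value)
                else:  # numeric opacity: alpha-blend over the current value
                    self.galaxy[c] = int(round(current * (1 - mode) + value * mode))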
def __init__(self, visualizer=None, speaker_recognition=False):
    self.merger_to_main_queue = Queue(maxsize=1000)  # very roughly 30 sec of audio
    self.merger = Merger(self.merger_to_main_queue)
    if visualizer is None:
        self.visualization = False
    else:
        self.visualization = True
        self.main_to_vis_queue = Queue(maxsize=50)
        self.visualizer = visualizer(self.main_to_vis_queue)
    self.speakers = {}
    self.num_speakers = 0
    self.stt = T2t_stt()
    self.speaker_recognition = speaker_recognition
    # if self.speaker_recognition:
    #     self.sr = Speaker_recognition()
    self.text_queue = mult_Queue()
    self.bing_allowed = False
def set_params(self, **kwargs):
    print kwargs.keys()
    self.cols = kwargs.get('cols', self.cols)
    self.full_table = kwargs.get('full_table', self.full_table)
    self.bad_tables = kwargs.get('bad_tables', self.bad_tables)
    self.good_tables = kwargs.get('good_tables', self.good_tables)
    self.bad_err_funcs = kwargs.get('bad_err_funcs', self.bad_err_funcs)
    self.good_err_funcs = kwargs.get('good_err_funcs', self.good_err_funcs)
    assert self.bad_tables is not None, "table not set"
    assert self.bad_err_funcs is not None, "error func not set"
    self.table = self.full_table
    domain = self.full_table.domain
    attrnames = [attr.name for attr in domain]
    self.cont_dists = dict(zip(attrnames, Orange.statistics.basic.Domain(self.full_table)))
    self.disc_dists = dict(zip(attrnames, Orange.statistics.distribution.Domain(self.full_table)))
    Merger.set_params(self, **kwargs)
def __init__(self, *args, **kwargs):
    Merger.__init__(self, *args, **kwargs)
    self.learner_hash = kwargs.get('learner_hash', '')
    self.c_range = kwargs.get('c_range', [0.01, 0.7])
    self.get_frontier = Frontier(self.c_range, 0.)
    self.CACHENAME = './dbwipes.rangemerger.cache'
    self.i = 0

    #
    # per-execution state
    #

    # dim -> list of value subsets that were not on the frontier
    # e.g., subregion -> [ (SR1, SR2), (SR3), ... ]
    self.rejected_disc_vals = defaultdict(list)

    # (dim, direction) -> range it has expanded along
    self.rejected_cont_vals = defaultdict(set)
def __init__(self, indent=None, to_explore=False):
    self.indent = indent
    self.indent_children = None
    self.content = []
    self.parent = None
    self.to_explore = to_explore
    self.merger = Merger()
    self.padding = None
    self.sf = None
    self.sc = None
def ProcessRequest(file):
    name = str(uuid4())
    base_file_name = "%s-%s" % (name, secure_filename(file.filename))
    file_name = "tmp/%s" % base_file_name
    print(file_name)
    file.save(file_name)
    with ZipFile(file_name, 'r') as zipObj:
        zipObj.extractall("tmp/%s" % name)
    Merger("tmp/%s" % name, os.path.realpath("tmp/combined-%s.ics" % name))
def merge_videos():
    _, res = iqiyi.getLastRes()
    path_name_seg = []
    for i in all_filename:
        path_name_seg.append(os.path.join(video_title, i))
    mer = Merger(
        unicode(os.path.join(target_path, video_title + '.' + res[sel_bid]['ff'])),
        path_name_seg)
    gui.frame_main.initTotal_Merge(len(all_filename))
    mer.start()
    while True:
        gui.frame_main.updateMerge(mer.now)
        time.sleep(0.01)
        if mer.now == mer.sum:
            gui.frame_main.updateMerge(mer.now)
            break
    del_seg_video()
def merge(self):
    # self.text_dst.config(state='normal')
    text = self.text_src.get('1.0', END)
    codes2num = decode(text)
    self.merger = Merger(codes2num)
    self.text_dst.delete(0.0, END)
    result_text = ''
    for k in range(10, 3, -1):
        result_text += 'Max length ' + str(k) + ' '
        result_text += encode(self.merger.merge_result(k))
    self.text_dst.insert(END, result_text)
def dim_merge(self, cluster, dim, dec=None, inc=None, skip=None):
    if dec is not None:
        if round(dec, 1) in self.rejected_cont_vals[(dim, 'dec')]:
            return None
    if inc is not None:
        if round(inc, 1) in self.rejected_cont_vals[(dim, 'inc')]:
            return None
    merged = Merger.dim_merge(self, cluster, dim, dec, inc, skip)
    if merged:
        merged.c_range = list(self.c_range)
        merged.inf_func = self.learner.create_inf_func(merged)
    return merged
def merge():
    try:
        if request.method == 'OPTIONS':
            return make_response(jsonify({"Allow": "POST"}), 200)
        if not request.json or 'foreground_url' not in request.json or 'background_url' not in request.json:
            abort(400)
        foreground_url = request.json['foreground_url']
        background_url = request.json['background_url']
        m = Merger(foreground_url, background_url)
        m.merge_images()
        response = {
            'output_image': {
                'name': m.get_output_image('name'),
                'url': url_for('get_image', image_name=m.get_output_image('name'), _external=True),
                'base64': m.get_output_image('base64')
            }
        }
        return jsonify(response), 201
    except Exception as e:
        err_msg = e.message
        if err_msg == '':
            err_msg = 'Internal Error. Please Try Again'
        return make_response(jsonify({'error': err_msg}), 202)
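# A hedged example of calling the endpoint above from a client. The route path and
# host (/merge on localhost:5000) are assumptions, since the route decorator is not
# shown in this snippet; the request/response shape matches the handler above.
import requests

payload = {
    "foreground_url": "https://example.com/foreground.png",
    "background_url": "https://example.com/background.png",
}
resp = requests.post("http://localhost:5000/merge", json=payload)
if resp.status_code == 201:
    output = resp.json()["output_image"]
    print(output["name"], output["url"])  # the base64 payload is in output["base64"]
else:
    print("merge failed:", resp.json().get("error"))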
def main():
    scrapper = Scrapper()
    merger = Merger()
    parser = Parser()
    client = MongoClient('localhost', 27017)
    db = client['Data']
    collection_socialmedia = db['socialmedia']

    # Begin real-time collecting
    while True:
        scrapper.scrap()
        merger.main()
        parser.main()
        sleep(3600)

        # Store to MongoDB
        f = open('/home/sartharion/Bureau/stage/POO/data.json', 'r')
        file_data = json.load(f)
        collection_socialmedia.delete_many({})
        collection_socialmedia.insert_many(file_data)

    client.close()
def __init__(self, cldict, sampd):
    self.cldict = cldict
    self.sampd = sampd
    self.mergo = Merger(cldict, sampd)
    self.meto = Metrics(cldict)
    lbwao = None
    lbbmapo = None
    lref_acc_str = sampd.ref_acc_str
    if lref_acc_str != "none":
        lbwao = AlignerBwa(cldict, sampd)
    self.bwao = lbwao
    self.samfco = SamFC(cldict, sampd)
def merge_addr(terc):
    log_io = io.StringIO()
    logging.basicConfig(level=10, handlers=[logging.StreamHandler(log_io)])
    addr = get_addresses_terc(terc)
    m = Merger([], addr, terc, "emuia.gugik.gov.pl")
    m.create_index()
    m.merge_addresses()
    return make_response(m.get_incremental_result(log_io), 200)
def merge_addr(terc):
    logIO = io.StringIO()
    logging.basicConfig(level=10, handlers=[logging.StreamHandler(logIO)])
    addr = json.loads(overpass.getAddresses(terc))
    m = Merger([], addr, terc, "emuia.gugik.gov.pl")
    m._create_index()
    m.merge_addresses()
    return make_response(m.get_incremental_result(logIO), 200)
def do_patch(self, file):
    merger = Merger(file)
    compile = False
    for p in self.patches:
        if p._apply:
            compile = True
            print("Merging program %s for \"%s\"..." % (p.get_program(), p.name()))
            f_m = open(p.get_program(), 'r+b')
            merger.merge(f_m)
            f_m.close()
            print("Program %s for \"%s\" merged.\n" % (p.get_program(), p.name()))
    if compile:
        print("Compiling ...")
        merger.compile()
        print("")
    for p in self.patches:
        if p._apply:
            print("Patching \"%s\"..." % (p.name()))
            file = p.patch(file)
            print("")
    return file
def disc_merge(self, cluster, dim, vals, skip=None):
    # reject if the union is a superset of anything in rejected_disc_vals
    vals = set(vals)
    vals.update(cluster.discretes.get(dim, ()))
    for subset in self.rejected_disc_vals[dim]:
        if vals.issuperset(subset):
            return None
    merged = Merger.disc_merge(self, cluster, dim, vals, skip)
    if merged:
        merged.c_range = list(self.c_range)
        merged.inf_func = self.learner.create_inf_func(merged)
    return merged
def get_history(id: ObjectId, num_changes: int = None):
    hist = history.find({'ref': id}).sort('_id', direction=pymongo.DESCENDING)
    curr = data.find_one({'_id': id})
    yield curr
    prev = curr
    count = 0
    merger = Merger()
    for d in hist:
        if num_changes and count == num_changes:
            break
        d['ref_creation_time'] = d['_id'].generation_time
        del d['_id']
        del d['ref']
        l: dict = copy.deepcopy(prev)
        merger.merge_changes(l, d)
        yield l
        prev = l
        count += 1
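# Illustrative use of the generator above: walk back through a document's history,
# newest state first. The `data`/`history` collections and the delta format consumed
# by Merger.merge_changes come from the surrounding module; the id below is hypothetical.
from bson import ObjectId

doc_id = ObjectId("5f43a1b2c9e77a0001234567")  # hypothetical document id
for version, snapshot in enumerate(get_history(doc_id, num_changes=5)):
    print("version -%d:" % version, snapshot)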
def __init__(self, cldict, sampd):
    self.cldict = cldict
    self.sampd = sampd
    self.mergo = Merger(cldict, sampd)
    self.meto = Metrics(cldict)
    lbwao = None
    lbbmapo = None
    lref_acc_str = sampd.ref_acc_str
    lhost_ref_str = sampd.host_ref_str
    if lref_acc_str != "none":
        lbwao = AlignerBwa(cldict, sampd)
    if lhost_ref_str != "none":
        lbbmapo = AlignerBBMap(cldict, sampd)
    self.bwao = lbwao
    self.bbmapo = lbbmapo
    self.samfco = SamFC(cldict, sampd)
    self.jlco = CounterJL(cldict, sampd)
    print("Created JLCounter object")
def __init__(self, args):
    self.config_log_file = args.config_log_file
    self.sample_id = args.sample_id
    self.project_id = args.project_id
    self.prefix_set = args.prefix_set
    self.bc_set = args.bc_set
    cldict_d = yaml.load(open(self.config_log_file))
    cldict = DictMap(cldict_d)
    self.cldict = cldict
    sampd = dict()
    sampd['sample_id'] = self.sample_id
    sampd['project_id'] = self.project_id
    sampd['prefix_set'] = self.prefix_set
    sampd['bc_set'] = self.bc_set
    sampd_map = DictMap(sampd)
    self.sampd = sampd_map
    mergo = Merger(cldict, sampd_map)
    self.mergo = mergo
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="""Merge an OSM file of address nodes with buildings in the area given by a TERC code""")
    parser.add_argument('--addr', help='File with address nodes to merge', required=True)
    parser.add_argument('--building', help='File with buildings to merge', required=True)
    parser.add_argument('--output', help='output file with merged data (default: result.osm)')
    parser.add_argument('--terc', help='Teryt TERC code for the processed area')
    parser.add_argument('--log-level',
                        help='Set logging level (debug=10, info=20, warning=30, error=40, critical=50), default: 20',
                        dest='log_level', default=20, type=int)
    args = parser.parse_args()

    log_stderr = logging.StreamHandler()
    log_stderr.setLevel(args.log_level)
    logIO = io.StringIO()
    logging.basicConfig(level=10, handlers=[log_stderr, logging.StreamHandler(logIO)])

    if args.output:
        output = open(args.output, "wb")
    else:
        parts = args.addr.rsplit('.', 1)
        parts[0] += '-merged'
        output = open('.'.join(parts), "xb")
        print("Output filename: %s" % ('.'.join(parts),))

    data = [OsmAddress.from_soup(x) for x in osm_to_json(lxml.etree.parse(open(args.addr)))['elements']]
    addr = osm_to_json(open(args.building))
    m = Merger(data, addr, args.terc)
    for i in data:
        m._do_merge_create_point(i)
    m.create_index()
    m.merge_addresses()
    output.write(m.get_incremental_result(logIO))
def QC_tumor_normal(self): # Separate the runs into tumor and normal lists normal_runs, tumor_runs = self.getTumor_Normal() if self.sample_json['analysis']['settings'][ 'type'] == 'all_tumor_normal': # Use the sample_status here to not re-run the QC and to not overwrite run status. The 'sample_status' should be reset to 'pushed' when new runs are pushed.. #if self.sample_json['sample_status'] != 'pending_merge' and self.sample_json['sample_status'] != 'pending_3x3_review' and self.sample_json['sample_status'] != 'merged': # if the user specified the '--pass_fail' option, then run this part still if self.sample_json[ 'sample_status'] == 'pushed' or self.options.pass_fail or self.options.qc_all: # QC the normal or tumor runs with each other self.QC_runs(normal_runs, 'normal_') self.QC_runs(tumor_runs, 'tumor_') # now QC the tumor and normal runs together. self.QC_normal_tumor_runs(normal_runs, tumor_runs) # make the merger merger = Merger(self.sample_json, self.options.recalc_3x3_tables) # Check to see if the normal runs are ready to be merged. self.sample_json, merge_normal = merger.check_merge( normal_runs, 'Normal/', 'normal_') if merge_normal == True: # merge the normal and/or tumor runs. Will only merge the passing runs with each other. self.sample_json = merger.merge_runs('normal', 'Normal_', 'normal_') # Check to see if the tumor runs are ready to be merged. self.sample_json, merge_tumor = merger.check_merge( tumor_runs, 'Tumor/', 'tumor_') if merge_tumor == True: self.sample_json = merger.merge_runs('tumor', 'Tumor_', 'tumor_') # If any runs were merged, QC them. If there are only 1 normal and tumor run, they won't be QCd again. #if normal_merge_dir != '' or tumor_merge_dir != '' or (len(normal_passing_bams) == 1 and len(tumor_passing_bams) == 1): # now QC the tumor and normal merged bams together if both normal and tumor runs are ready. # To only QC all for the actual merged runs (PNET), change the 'final' part to 'merged'. 
# The 'final_normal_json' and 'final_tumor_json' flags are set by merger.py in the function check_merge, line 157 #if (merge_normal or merge_tumor) and ('merged_normal_json' in self.sample_json and 'merged_tumor_json' in self.sample_json): if 'final_normal_json' in self.sample_json and 'final_tumor_json' in self.sample_json: self.sample_json, qc_json = self.qc_run.QC_2Runs( self.sample_json, self.sample_json['final_normal_json'], self.sample_json['final_tumor_json'], 'normal_', 'tumor_', '_merged') self.sample_json, merged_perc_avail_bases = self.qc_run.update_3x3_runs_status( self.sample_json, self.sample_json['final_normal_json'], self.sample_json['final_tumor_json'], qc_json) # update the merged run status merger.update_merged_run_status( self.sample_json['final_normal_json'], merged_perc_avail_bases) merger.update_merged_run_status( self.sample_json['final_tumor_json'], merged_perc_avail_bases) # cleanup the individual run bam files if merged_perc_avail_bases > .9: final_qc_dir = "%s/all%svs%s" % ( self.sample_json['qc_folder'], json.load(open(self.sample_json['final_normal_json'])) ['run_name'], json.load(open( self.sample_json['final_tumor_json']))['run_name']) # annotate the final somatic variants command = "bash %s/Somatic_Variants/somatic_variants.sh %s %s %s" % ( self.sample_json['analysis']['software_directory'], final_qc_dir, self.sample_json['sample_name'], self.sample_json['analysis']['software_directory']) if runCommandLine(command) != 0: sys.stderr.write("ERROR: somatic annotation failed!\n") # Cleanup the PTRIM.bam and chr bam files after all of the QC is done. # are there any other files to clean up? self.cleanup_sample.cleanup_runs( self.sample_json['runs'], self.sample_json['analysis']['settings']['cleanup'], self.no_errors) #self.cleanup_sample.delete_runs(runs, self.sample_json['analysis']['settings']['cleanup'], self.no_errors) # Cleanup after the merging QC is done. self.cleanup_sample.cleanup_runs([ self.sample_json['final_normal_json'], self.sample_json['final_tumor_json'] ], self.sample_json['analysis']['settings']['cleanup'], self.no_errors) # Set the sample_status self.sample_json['sample_status'] = 'merged_pass' else: self.sample_json[ 'sample_status'] = 'awaiting_more_sequencing'
def QC_germline(self): # Use the sample_status here to not re-run the QC and to not overwrite run status. The 'sample_status' should be reset to 'pushed' when new runs are pushed.. #if self.sample_json['sample_status'] != 'pending_merge' and self.sample_json['sample_status'] != 'pending_3x3_review' and self.sample_json['sample_status'] != 'merged': # if the user specified the '--pass_fail' option, then run this part still if self.sample_json[ 'sample_status'] == 'pushed' or self.options.pass_fail or self.options.qc_all: # QC the normal runs with each other self.QC_runs(self.sample_json['runs']) # what if there is only one run that passes all of the metrics? It should be marked as the 'final_json' and have the 'pass_fail_merged' flag marked as pass. # make the merger merger = Merger(self.sample_json, self.options.recalc_3x3_tables) # Check to see if the normal runs are ready to be merged. self.sample_json, merge = merger.check_merge(self.sample_json['runs']) if merge != True: if 'final_json' in self.sample_json: # update the final run status merger.update_merged_run_status(self.sample_json['final_json']) elif merge == True: # merge the normal and/or tumor runs. Will only merge the passing runs with each other. self.sample_json = merger.merge_runs('germline') # update the merged run status merger.update_merged_run_status(self.sample_json['merged_json']) if json.load(open(self.sample_json['merged_json']) )['pass_fail_merged_status'] == 'pass': # Set the sample_status self.sample_json['sample_status'] = 'merged_pass' # cleanup the individual run bam files self.cleanup_sample.cleanup_runs( self.sample_json['runs'], self.sample_json['analysis']['settings']['cleanup'], self.no_errors) # Cleanup the merged dir self.cleanup_sample.cleanup_runs( [self.sample_json['merged_json']], self.sample_json['analysis']['settings']['cleanup'], self.no_errors) else: self.sample_json['sample_status'] = 'awaiting_more_sequencing' # copy the final run's VCF file to the final_dir if it passes the "merged" coverage flag if 'final_json' in self.sample_json: final_json = json.load(open(self.sample_json['final_json'])) if final_json['pass_fail_merged_status'] == 'pass': final_vcf = glob.glob("%s/*.vcf" % final_json['run_folder'])[0] final_project_dir = "/home/ionadmin/jeff/%s_Final_VCFs" % ( self.sample_json['project']) print "copying %s to %s" % (final_vcf, final_project_dir) # check to make sure the final dir exists. if not os.path.isdir(final_project_dir): os.mkdir(final_project_dir) shutil.copy( final_vcf, "%s/%s.vcf" % (final_project_dir, self.sample_json['sample_name'])) # now push the sample to s3 storage if self.sample_json['project'] == 'Einstein': print "pushing %s to amazon s3 storage" % self.sample_json[ 'sample_name'] self.push_sample_to_s3(final_json)
def test_adding_layers(self):
    m = Merger()
    l = Layer("1")
    m.addLayer(l)
    self.assertEqual(m.layers, [l])

    m = Merger()
    l = Layer("1")
    l2 = Layer("1")
    m.addLayer(l)
    m.addLayer(l2)
    self.assertEqual(m.layers, [l2])
    m.delLayer(l2)
    m.delLayer(l)
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print "usage: python builder.py [filename]"
        sys.exit(1)
    archive = sys.argv[1]
    fp = FileParser()
    fp.extract(archive, "tmp")
    extract_all(archive)
    # get the filename from the archive: nz.tar => tmp/nz_merged
    files = fp.getFiles("tmp/" + archive.split(".")[0] + "_merged")
    t = time.time()
    r = IndexBuilder(files, UrlTable(), CParser(), Pipeline())
    printf("parsing %d files:" % len(r.files))
    r.process()
    print "\nparsed %d pages in %d files in %f seconds" % (r.page_id, r.id, time.time() - t)
    print "average %f seconds to parse each file" % ((time.time() - t) / r.id)
    print "starting to build the inverted index: "
    t = time.time()
    m = Merger("tmp")
    m.merge()
    print "\nbuilt inverted index for %d records in %f seconds" % (r.uid, (time.time() - t))
    cleanup("tmp")
    build_index("rindex")
class SAM: def __init__(self, visualizer=None, speaker_recognition=False): self.merger_to_main_queue = Queue(maxsize=1000) # very roughly 30sec self.merger = Merger(self.merger_to_main_queue) if visualizer is None: self.visualization = False else: self.visualization = True self.main_to_vis_queue = Queue(maxsize=50) self.visualizer = visualizer(self.main_to_vis_queue) self.speakers = {} self.num_speakers = 0 self.stt = T2t_stt() self.speaker_recognition = speaker_recognition # if self.speaker_recognition: # self.sr = Speaker_recognition() self.text_queue = mult_Queue() self.bing_allowed = False def handle_service(self, req): rospy.loginfo("entered handle service") self.visualizer.idle = False # msg = Int32() # msg.data = 0 # self.ledmode_pub.publish(msg) queue = mult_Queue() self.bing_allowed = True p = Process(target=self.stt_subprocess, args=(queue, )) p.start() p.join() # msg = msg_Empty() # self.ledfreeze_pub.publish(msg) # self.visualizer.idle = True self.bing_allowed = False return queue.get() def stt_subprocess(self, q): # clear the text queue # rospy.loginfo("clear the text queue") # while not self.text_queue.empty(): # rospy.loginfo("got an item from the queue ->" + self.text_queue.get()) # wait for the next text to arrive rospy.loginfo("going to wait for the text_queue to be filled again") rate = rospy.Rate(1) while self.text_queue.empty() and not rospy.is_shutdown(): rospy.loginfo("still waiting, current length : " + str(self.text_queue.qsize())) rate.sleep() # put it into the return queue rospy.loginfo("got one and put it into the dedicated queue") q.put(self.text_queue.get()) def mode_callback(self, msg): if msg.mode == 2: self.visualizer.heartbeat = True def freeze_callback(self, msg): self.visualizer.heartbeat = False def run(self): self.merger.start() if self.visualization: self.visualizer.start() recording_id_odas = [0, 0, 0, 0] last_recording_id_odas = [0, 0, 0, 0] recordings = {} # request to speaker recognition waiting to be answered, key is the id, # value is the queue in which the result will be stored sr_requests = {} # kevins ros changes pub = rospy.Publisher('/roboy/cognition/sam/output', String, queue_size=10) rospy.Subscriber("/roboy/control/matrix/leds/mode", ControlLeds, self.mode_callback) rospy.Subscriber("/roboy/control/matrix/leds/freeze", msg_Empty, self.freeze_callback) # s = rospy.Service('/roboy/cognition/speech/recognition', RecognizeSpeech, self.handle_service) # self.ledmode_pub = rospy.Publisher("/roboy/control/matrix/leds/mode/simple", Int32, queue_size=3) # self.ledoff_pub = rospy.Publisher('/roboy/control/matrix/leds/off', msg_Empty, queue_size=10) # self.ledfreeze_pub = rospy.Publisher("/roboy/control/matrix/leds/freeze", msg_Empty, queue_size=1) # self.ledpoint_pub = rospy.Publisher("/roboy/control/matrix/leds/point", Int32, queue_size=1) rospy.init_node("SAM", anonymous=True) # operation average angle_list = [] while self.merger.is_alive() and not rospy.is_shutdown(): # we do ask for the next data block # maybe this is the place where i can insert a call and replace the while loop # wait for/get next data try: next_data = self.merger_to_main_queue.get(block=True, timeout=1) except q_Empty: continue # restart loop, but check again if we maybe got a stop signal cid = next_data['id_info'] caudio = next_data['audio_data'] ############################################################################################ # this part separates the 4 streams and manages the ones where currently audio is being recorded 
######################################################################################### # cid[i] = [id, x, y, z, activity] for i in range(len(cid)): # len=4 recording_id_odas[i] = cid[i][0] if recording_id_odas[i] > 0: if recording_id_odas[i] == last_recording_id_odas[i]: # same person continues speaking recordings[recording_id_odas[i]].audio = np.append( recordings[recording_id_odas[i]].audio, caudio[i]) recordings[recording_id_odas[i]].currentpos = [ cid[i][1], cid[i][2], cid[i][3] ] else: # a person started speaking recordings[recording_id_odas[i]] = Recording( recording_id_odas[i], [cid[i][1], cid[i][2], cid[i][3]]) recordings[recording_id_odas[i]].audio = np.append( recordings[recording_id_odas[i]].audio, caudio[i]) # if a different person was speaking before, he is now done if last_recording_id_odas[i] > 0: recordings[ last_recording_id_odas[i]].stopped = True elif recording_id_odas[ i] == 0 and last_recording_id_odas[i] > 0: # if a different person was speaking before, he is now done recordings[last_recording_id_odas[i]].stopped = True last_recording_id_odas[i] = recording_id_odas[i] ########################################################## # check if we got any answers from sr (speaker recognition) in the meantime ############################################################# to_delete_req = [] for rec_id, req in sr_requests.iteritems(): try: # sr_id: -99 means new speaker # certainty between 0-10 certainty = 0 preliminary_id, sr_id, certainty = req.get(block=False) # Fuse info of speaker recognition on localization together # First the best case, both agree on an is/new speker if sr_id == recordings[rec_id].preliminary_speaker_id: # both agree, thats nice recordings[rec_id].final_speaker_id = recordings[ rec_id].preliminary_speaker_id recordings[rec_id].send_to_trainer = True elif recordings[ rec_id].created_new_speaker and sr_id == -99: # both agree, that this is a new speaker output_string = "both agree that rec %d is new speaker %d" % ( rec_id, recordings[rec_id].preliminary_speaker_id) rospy.logdebug(output_string) recordings[rec_id].final_speaker_id = recordings[ rec_id].preliminary_speaker_id recordings[rec_id].send_to_trainer = True else: # Now come the harder parts. 
if certainty < 1: # if speaker recognition is unsure we rely on localization recordings[rec_id].final_speaker_id = recordings[ rec_id].preliminary_speaker_id elif certainty > 8: # sr is super sure, we trust it recordings[rec_id].final_speaker_id = sr_id recordings[rec_id].sr_changed_speaker = True else: # check the angle the the speaker sr suggested, and depending on the certainty decide # go through the list of speaker angles and find the one one which sr suggests found = False for (oth_id, angl ) in recordings[rec_id].angles_to_speakers: if oth_id == sr_id: # the further we are away the shurer sr has to be if certainty * 20 > angl: recordings[ rec_id].final_speaker_id = sr_id recordings[ rec_id].sr_changed_speaker = True else: recordings[ rec_id].final_speaker_id = recordings[ rec_id].preliminary_speaker_id found = True break if not found: # this shouldn't happen output_string = "Speaker recognition suggestested id {} for recording {}," \ " which doesn't exist".format(sr_id, rec_id) rospy.logerr(output_string) recordings[ rec_id].final_speaker_id = recordings[ rec_id].preliminary_speaker_id output_string = "response for req %d, results is %d, certanty %d" % ( rec_id, sr_id, certainty) rospy.logdebug(output_string) recordings[rec_id].is_back_from_sr = True to_delete_req.append(rec_id) except q_Empty: if time.time() - recordings[ rec_id].time_sent_to_sr > 3: # no response from sr for 3 sec -> timeout # print("no response for request %d in 3 sec -> timeout" % (rec_id)) recordings[rec_id].final_speaker_id = recordings[ rec_id].preliminary_speaker_id recordings[rec_id].is_back_from_sr = True to_delete_req.append(rec_id) for req in to_delete_req: del sr_requests[req] ################################################################################## # here we go through our recordings and handle them based on their current status #################################################################################### to_delete = [] rec_info_to_vis = [] for rec_id, rec in recordings.iteritems(): if self.visualization and not rec.stopped: # convert audio to energy and append it to the tuple # Energy is the root mean square of the signal # E = sqrt(sum(s[n]^2)/N) curr_energy = np.sqrt(np.mean(np.square(rec.audio.data))) if not rec.stopped: rec_info_to_vis.append([ rec_id, rec.currentpos[0], rec.currentpos[1], rec.currentpos[2], 200, curr_energy ]) # 200 is the size of the blob else: rec_info_to_vis.append([ rec_id, rec.currentpos[0], rec.currentpos[1], rec.currentpos[2], 50, curr_energy ]) if rec.new: output_string = "new recording " + str(rec_id) rospy.loginfo(output_string) # get angles to all known speakers rec.get_angles_to_all_speakers(self.speakers, rec.startpos) # if it is wihthin a certain range to a known speaker, assign it to him if len( self.speakers ) > 0 and rec.angles_to_speakers[0][1] < 35: # degree output_string = "preliminary assigning recording %d to speaker %d, angle is %d" % ( rec_id, rec.angles_to_speakers[0][0], rec.angles_to_speakers[0][1]) rospy.loginfo(output_string) rec.preliminary_speaker_id = rec.angles_to_speakers[0][ 0] rec.final_speaker_id = rec.preliminary_speaker_id # this will be overwritten later else: # create a new speaker self.num_speakers += 1 new_id = self.num_speakers self.speakers[new_id] = Speaker(new_id, rec.startpos) rec.preliminary_speaker_id = new_id rec.final_speaker_id = rec.preliminary_speaker_id # this will be overwritten later rec.created_new_speaker = True closest_ang = -999 if len(rec.angles_to_speakers) > 0: closest_ang = 
rec.angles_to_speakers[0][1] output_string = "creating new speaker %d for recording %d, closest angle is %d" % ( new_id, rec_id, closest_ang) rospy.logdebug(output_string) if self.num_speakers == 1: rec.send_to_trainer = True rec.new = False # elif self.speaker_recognition and (not rec.was_sent_sr and rec.audio.shape[ # 0] > 16000 * 3): # its longer than 3 sec, time to send it to speaker recognition # sr_requests[rec_id] = Queue(maxsize=1) # self.sr.test(rec.audio, rec.preliminary_speaker_id, sr_requests[rec_id]) # rec.was_sent_sr = True # rec.time_sent_to_sr = time.time() elif rec.stopped: # speaker finished, handle this if not rec.alldone: if rec.audio.shape[ 0] < 16000 * 0.4: # everything shorter than this we simply discard output_string = "recording %d was too short, discarding" % ( rec_id) print output_string rospy.loginfo(output_string) if rec.created_new_speaker: del self.speakers[rec.preliminary_speaker_id] output_string = "thus also deleting speaker" + str( rec.preliminary_speaker_id) rospy.logdebug(output_string) rec.alldone = True if not rec.alldone: if (rec.was_sent_sr and rec.is_back_from_sr) or (not rec.was_sent_sr): if not rec.was_sent_sr: # it seems like this has been to short to be sent to rec.final_speaker_id = rec.preliminary_speaker_id self.speakers[ rec.final_speaker_id].pos = rec.currentpos if rec.created_new_speaker and rec.sr_changed_speaker: try: del self.speakers[ rec.preliminary_speaker_id] except: output_string = "Error deleting preliminary speaker " + str( rec.preliminary_speaker_id) print output_string rospy.logerr(output_string) # TODO: # send to speech to text if self.bing_allowed: text = self.stt.get_text(rec.audio) # wavfile.write(text.encode('utf-8') + ".wav", 16000, rec.audio.data) else: text = "bing is not allowed yet" # output_string = "Speaker {}: ".format(rec.final_speaker_id) + text.encode('utf-8') output_string = text.encode('utf-8') rospy.loginfo(output_string) pub.publish(output_string) if self.bing_allowed: self.text_queue.put(output_string) rospy.logdebug("text_queue lenght in main: " + str(self.text_queue.qsize())) # send this to trainer # if self.speaker_recognition and rec.send_to_trainer: # self.sr.train(rec.final_speaker_id, rec.audio) # output_string = "sending recording %d to trainer" % (rec_id) # rospy.logdebug(output_string) output_string = "succesfully handeld recording " + str( rec_id) rospy.logdebug(output_string) rec.alldone = True else: pass # wait for the response of sr if rec.alldone: to_delete.append(rec_id) for rec_id in to_delete: del recordings[rec_id] if self.visualization: try: self.main_to_vis_queue.put( { 'speakers': self.speakers, 'recordings': rec_info_to_vis }, block=False) except Full: # print("couldn't put data into visualization queue, its full") pass # --------------------------------------------------------------------------------------------------- # new doa to led addon # print # print "------------------------------------" # print "speakers: " # print self.speakers # print "rec_info_to_vis: " # operation average # if len(rec_info_to_vis) > 0 and not self.bing_allowed: # # print "0 -> ", rec_info_to_vis[0][0] # # print "1 -> ", rec_info_to_vis[0][1] # # print "2 -> ", rec_info_to_vis[0][2] # # print "3 -> ", rec_info_to_vis[0][3] # # print "4 -> ", rec_info_to_vis[0][4] # angle_list.append(rec_info_to_vis[0][1]) # if len(angle_list) >= 10: # publish_point_left_right(self.ledpoint_pub, sum(angle_list)/len(angle_list)) # angle_list = [] # else: # print "Empty dude" # print "------------------------------------" # 
print # publish_point(self.ledpoint_pub, rec_info_to_vis[1]) # --------------------------------------------------------------------------------------------------- output_string = "SAM is done." print output_string rospy.loginfo(output_string) self.merger.stop() if self.visualization: self.visualizer.stop() rospy.signal_shutdown("SAM is done.")
def my_fit(self, Xs, y, time_ramain, X_test): np.random.seed(CONSTANT.SEED) split = CONSTANT.SPLIT self.split = split log(f'split {split}') if split == -1: config = Config(time.time(), self.info['time_budget']) X_test.index = -X_test.index - 1 main_shape = Xs[CONSTANT.MAIN_TABLE_NAME].shape[0] main_max_shape = 2888888 main_min_shape = min(main_shape, 100000) test_shape = X_test.shape[0] max_accept_shape = 3999999 if main_shape + test_shape > max_accept_shape: sample_main_shape = max_accept_shape - test_shape if sample_main_shape > main_max_shape: sample_main_shape = main_max_shape if sample_main_shape < main_min_shape: sample_main_shape = main_min_shape log(f'start sample main table. origin main shape {main_shape} test shape {test_shape} sample rows num {sample_main_shape}' ) if 'time_col' in self.info: key_time_col = self.info['time_col'] if key_time_col in Xs[CONSTANT.MAIN_TABLE_NAME].columns: Xs[CONSTANT.MAIN_TABLE_NAME].sort_values( by=key_time_col, inplace=True) Xs[CONSTANT.MAIN_TABLE_NAME] = Xs[ CONSTANT.MAIN_TABLE_NAME].iloc[-sample_main_shape:] gc.collect() Xs[CONSTANT.MAIN_TABLE_NAME] = pd.concat( [Xs[CONSTANT.MAIN_TABLE_NAME], X_test]) X_test.drop(X_test.columns, axis=1, inplace=True) gc.collect() graph = Graph(self.info, Xs) graph.sort_tables() train_index = Xs[CONSTANT.MAIN_TABLE_NAME].index[ Xs[CONSTANT.MAIN_TABLE_NAME].index >= 0] y = y.loc[train_index] test_index = Xs[CONSTANT.MAIN_TABLE_NAME].index[ Xs[CONSTANT.MAIN_TABLE_NAME].index < 0] graph.preprocess_fit_transform() gc.collect() merge_feat_pipeline = DeafultMergeFeatPipeline() merger = Merger(merge_feat_pipeline) merger.merge_table(graph) main_table = merger.merge_to_main_fit_transform(graph) self.release_tables(Xs, graph) del merger del graph gc.collect() feat_pipeline = DefaultFeatPipeline() feat_engine = FeatEngine(feat_pipeline, config) feat_engine.fit_transform_order1(main_table, y) sample_for_combine_features = True if sample_for_combine_features: main_data = main_table.data train_data = main_data.loc[main_data.index >= 0] del main_data sample_num = CONSTANT.SAMPLE_NUM train_shape = train_data.shape if train_shape[0] <= sample_num: sample_for_combine_features = False else: data_tail_new = train_data.iloc[-sample_num:] gc.collect() y_tail_new = y.loc[data_tail_new.index] table_tail_new = copy.deepcopy(main_table) table_tail_new.data = data_tail_new del data_tail_new gc.collect() feat_engine.fit_transform_all_order2(table_tail_new, y_tail_new, sample=True) feat_engine.fit_transform_keys_order2(table_tail_new, y_tail_new, sample=True) del table_tail_new, y_tail_new gc.collect() feat_engine.fit_transform_all_order2(main_table, y, selection=False) feat_engine.fit_transform_keys_order2(main_table, y, selection=False) feat_engine.fit_transform_post_order1(main_table, y) if not sample_for_combine_features: gc.collect() feat_engine.fit_transform_all_order2(main_table, y) feat_engine.fit_transform_keys_order2(main_table, y) feat_engine.fit_transform_keys_order3(main_table, y) feat_engine.fit_transform_post_order1(main_table, y) del feat_engine gc.collect() X_test = main_table.data.loc[test_index] main_table.data = main_table.data.loc[train_index] gc.collect() test_table = copy.deepcopy(main_table) test_table.data = X_test self.test_table = test_table len_test = X_test.shape[0] gc.collect() feat_engine = FeatEngine(feat_pipeline, config) feat_engine.fit_transform_merge_order1(main_table, y) self.feat_engine = feat_engine feat_output = FeatOutput() self.feat_output = feat_output X, y, categories = 
feat_output.final_fit_transform_output( main_table, y) del main_table gc.collect() lgb = AutoLGB() lgb.param_compute(X, y, categories, config) X_train, y_train, X_test, y_test = time_train_test_split( X, y, test_rate=0.2) lgb.param_opt_new(X_train, y_train, X_test, y_test, categories) gc.collect() del X_train, y_train, X_test, y_test gc.collect() X, y = self.shuffle(X, y, 2019) gc.collect() lgb.ensemble_train(X, y, categories, config, len_test) gc.collect() importances = lgb.get_ensemble_importances() self.model = lgb del X, y elif split == -2: config = Config(time.time(), self.info['time_budget']) Xs[CONSTANT.MAIN_TABLE_NAME] = pd.concat([ Xs[CONSTANT.MAIN_TABLE_NAME], ]) gc.collect() graph = Graph(self.info, Xs) graph.sort_tables() train_index = Xs[CONSTANT.MAIN_TABLE_NAME].index[ Xs[CONSTANT.MAIN_TABLE_NAME].index >= 0] y = y.loc[train_index] graph.preprocess_fit_transform() gc.collect() merge_feat_pipeline = DeafultMergeFeatPipeline() merger = Merger(merge_feat_pipeline) merger.merge_table(graph) main_table = merger.merge_to_main_fit_transform(graph) self.release_tables(Xs, graph) del merger del graph gc.collect() feat_pipeline = DefaultFeatPipeline() feat_engine = FeatEngine(feat_pipeline, config) feat_engine.fit_transform_order1(main_table, y) sample_for_combine_features = True if sample_for_combine_features: main_data = main_table.data train_data = main_data.loc[main_data.index >= 0] del main_data sample_num = CONSTANT.SAMPLE_NUM train_shape = train_data.shape if train_shape[0] <= sample_num: sample_for_combine_features = False else: data_tail_new = train_data.iloc[-sample_num:] gc.collect() log(f'sample data shape {data_tail_new.shape}') y_tail_new = y.loc[data_tail_new.index] table_tail_new = copy.deepcopy(main_table) table_tail_new.data = data_tail_new del data_tail_new gc.collect() feat_engine.fit_transform_all_order2(table_tail_new, y_tail_new, sample=True) feat_engine.fit_transform_keys_order2(table_tail_new, y_tail_new, sample=True) del table_tail_new, y_tail_new gc.collect() feat_engine.fit_transform_all_order2(main_table, y, selection=False) feat_engine.fit_transform_keys_order2(main_table, y, selection=False) feat_engine.fit_transform_post_order1(main_table, y) if not sample_for_combine_features: gc.collect() feat_engine.fit_transform_all_order2(main_table, y) feat_engine.fit_transform_keys_order2(main_table, y) feat_engine.fit_transform_keys_order3(main_table, y) feat_engine.fit_transform_post_order1(main_table, y) del feat_engine gc.collect() main_table.data = main_table.data.loc[train_index] gc.collect() def split_table(table, y): X = table.data X_train, y_train, X_test, y_test = time_train_test_split( X, y, shuffle=False, test_rate=0.2) table1 = copy.deepcopy(table) table1.data = X_train table2 = copy.deepcopy(table) table2.data = X_test return table1, y_train, table2, y_test table1, y_train, table2, y_test = split_table(main_table, y) feat_engine = FeatEngine(feat_pipeline, config) feat_engine.fit_transform_merge_order1(table1, y_train) self.feat_engine = feat_engine feat_output = FeatOutput() self.feat_output = feat_output X_train, y_train, categories = feat_output.fit_transform_output( table1, y_train) gc.collect() self.feat_engine.transform_merge_order1(table2) X_test = self.feat_output.transform_output(table2) lgb = AutoLGB() lgb.param_compute(X_train, y_train, categories, config) lgb.param_opt_new(X_train, y_train, X_test, y_test, categories) len_test = X_test.shape[0] lgb.ensemble_train(X_train, y_train, categories, config, len_test) gc.collect() pred, pred0 = 
lgb.ensemble_predict_test(X_test) auc = roc_auc_score(y_test, pred0) print('source AUC:', auc) auc = roc_auc_score(y_test, pred) Model.ensemble_auc.append(auc) print('ensemble AUC:', auc) importances = lgb.get_ensemble_importances() self.model = lgb del X_train, y_train, X_test, y_test gc.collect() paths = os.path.join(feature_importance_path, version) if not os.path.exists(paths): os.makedirs(paths) importances.to_csv(os.path.join( paths, '{}_importances.csv'.format( datetime.now().strftime('%Y%m%d%H%M%S'))), index=False)
class Application(Frame):
    def __init__(self, master=None):
        Frame.__init__(self, master)
        self.opend_file = None
        background = 'white'
        text_background = '#EEE'
        master.configure(bg=background)
        master.minsize(600, 600)
        master.title('Code Merger')
        master.rowconfigure(1, weight=1)
        for col in range(10):
            self.master.columnconfigure(col, weight=1)

        self.label_src = Label(master, text='Source codes', bg=background)
        self.label_src.grid(row=0, column=0, rowspan=1, columnspan=5, sticky=W+S, padx=10, pady=10)
        self.text_src = Text(master, bg=text_background)
        self.text_src.grid(row=1, column=0, rowspan=1, columnspan=5, sticky=W+E+N+S, padx=(10, 5), pady=0)

        self.label_dst = Label(master, text='Condensed codes', bg=background)
        self.label_dst.grid(row=0, column=5, rowspan=1, columnspan=5, sticky=W+S, padx=5, pady=10)
        self.text_dst = Text(master, bg=text_background)
        # self.text_dst.config(state='disable')
        self.text_dst.grid(row=1, column=5, rowspan=1, columnspan=5, sticky=W+E+N+S, padx=(5, 10), pady=0)

        self.button_open = Button(master, text='Import', width='10', bg=background, command=self.open)
        self.button_open.grid(row=2, column=2, rowspan=1, columnspan=2, sticky=N+S, pady=10)
        self.button_merge = Button(master, text='Merge', width='10', bg=background, command=self.merge)
        self.button_merge.grid(row=2, column=4, rowspan=1, columnspan=2, sticky=N+S, pady=10)
        self.button_save = Button(master, text='Export', width='10', bg=background, command=self.save)
        self.button_save.grid(row=2, column=6, rowspan=1, columnspan=2, sticky=N+S, pady=10)

    def open(self):
        self.openfile = tkFileDialog.askopenfile(mode='r', defaultextension=".txt")
        text = self.openfile.read()
        print 'File loaded.'
        print text
        self.text_src.delete(0.0, END)
        self.text_src.insert(END, text)

    def merge(self):
        # self.text_dst.config(state='normal')
        text = self.text_src.get('1.0', END)
        codes2num = decode(text)
        self.merger = Merger(codes2num)
        self.text_dst.delete(0.0, END)
        result_text = ''
        for k in range(10, 3, -1):
            result_text += 'Max length ' + str(k) + ' '
            result_text += encode(self.merger.merge_result(k))
        self.text_dst.insert(END, result_text)
        # self.text_dst.config(state='disable')

    def save(self):
        self.savefile = tkFileDialog.asksaveasfile(mode='w', defaultextension=".txt")
        text = self.text_dst.get(0.0, END)
        self.savefile.write(text.encode('utf-8'))
        self.savefile.close()
from merger import Merger

if __name__ == '__main__':
    m = Merger('file_one.txt', 'file_two.txt')
    m.run()
def __call__(self, full_table, bad_tables, good_tables, **kwargs): """ table has been trimmed of extraneous columns. """ self.setup_tables(full_table, bad_tables, good_tables, **kwargs) self.SCORE_ID = add_meta_column( chain(self.bad_tables, self.good_tables), SCORE_VAR) self.CLASS_ID = add_meta_column(chain(self.bad_tables, self.good_tables), "INFCLASS", vals=['0', '1']) start = time.time() self.compute_perrow_influences(self.bad_tables, self.bad_err_funcs) self.compute_perrow_influences(self.good_tables, self.good_err_funcs) self.cost_compute_inf = time.time() - start start = time.time() if self.tree_alg == 'c45': table, rules = self.c45_rules() elif self.tree_alg == 'or': table, rules = self.orange_dt_rules() elif self.tree_alg == 'dt': table, rules = self.sk_dt_rules(max_depth=12) elif self.tree_alg == 'rt': table, rules = self.sk_rt_rules(max_depth=12) else: _logger.warn( "unknown NDT algorithm %s. Defaulting to regression tree", self.tree_alg) table, rules = self.sk_rt_rules(max_depth=12) self.cost_learn = time.time() - start # # ok now convert rules to clusters # _logger.debug("got %d rules", len(rules)) fill_in_rules(rules, table, cols=self.cols) self.cost_learn = time.time() - start clusters = [Cluster.from_rule(rule, self.cols) for rule in rules] for cluster in clusters: cluster.error = self.influence_cluster(cluster) clusters = filter_bad_clusters(clusters) clusters.sort(key=lambda c: c.error, reverse=True) print '\n'.join(map(str, clusters[:5])) self.all_clusters = self.final_clusters = clusters return self.final_clusters # # merge the clusters # thresh = compute_clusters_threshold(clusters, nstds=1.5) is_mergable = lambda c: c.error >= thresh params = dict(kwargs) params.update({ 'cols': self.cols, 'err_func': self.err_func, 'influence': lambda c: self.influence_cluster(c), 'influence_components': lambda c: self.influence_cluster_components(c), 'is_mergable': is_mergable, 'use_mtuples': False, 'learner': self }) self.merger = Merger(**params) merged_clusters = self.merger(clusters) merged_clusters.sort(key=lambda c: c.error, reverse=True) clusters.extend(merged_clusters) normalize_cluster_errors(clusters) clusters = list(set(clusters)) self.all_clusters = clusters self.final_clusters = merged_clusters self.costs = {'cost_learn': self.cost_learn} return self.final_clusters
def assemble(self): """ Builder method: build a Chain of linked Components :return: """ log.info('Assembling Chain: %s...' % self.chain_str) # Create linked list of input/filter/output (ETL Component) objects chain_str = self.chain_str sub_comps = [] while chain_str: chain_str = chain_str.strip() # Check and handle Splitter construct # e.g. input_xml_file |(transformer_xslt|output_file) (output_std) (transformer_xslt|output_std) if chain_str.startswith('('): etl_section_name, chain_str = chain_str.split(')', 1) etl_section_name = etl_section_name.strip('(') # Check for subchain (split at Filter level) if '|' in etl_section_name: # Have subchain: use Chain to assemble sub_chain = Chain(etl_section_name, self.config_dict) sub_chain.assemble() child_comp = sub_chain.first_comp else: # Single component (Output) to split child_comp = factory.create_obj(self.config_dict, etl_section_name.strip()) # Assemble Components (can be subchains) for Splitter later sub_comps.append(child_comp) if '(' in chain_str: # Still components (subchains) to assemble for Splitter continue if len(sub_comps) > 0: if chain_str.startswith('|'): # Next component is Merger with children etl_comp = Merger(self.config_dict, sub_comps) dummy, chain_str = chain_str.split('|', 1) else: # Next component is Splitter with children etl_comp = Splitter(self.config_dict, sub_comps) sub_comps = [] else: # "Normal" case: regular Components piped in Chain if '|' in chain_str: # More than one component in remaining Chain etl_section_name, chain_str = chain_str.split('|', 1) else: # Last element, we're done! etl_section_name = chain_str chain_str = None # Create the ETL component by name and properties etl_comp = factory.create_obj(self.config_dict, etl_section_name.strip()) # Add component to end of Chain self.add(etl_comp)
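# For reference, the Splitter/Merger construct parsed by assemble() above comes from
# chain strings like the one quoted in its comments. Based on that parsing logic, the
# parenthesised sections become parallel sub-chains: if the text after the groups starts
# with '|' they feed a Merger, otherwise they hang off a Splitter. A small hedged sketch
# of driving it (the Chain class and config_dict come from this module; the chain string
# below is a hypothetical example):
def build_example_chain(config_dict):
    chain = Chain("input_xml_file |(transformer_xslt|output_file) (output_std)", config_dict)
    chain.assemble()
    return chain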
parser = OptionParser(version="%prog " + __VERSION__, usage=usage, description=banner)
parser.add_option("--dir", "-d", action="store", type="string", dest="dir",
                  help="Files match (Default: *.ics)", default="*.ics")
parser.add_option("--ical", "-i", action="store", type="string", dest="icalfile",
                  help="iCalendar file output")
(options, args) = parser.parse_args()

if options.icalfile == "":
    options.icalfile = None
if options.icalfile is not None:
    options.icalfile = os.path.realpath(options.icalfile)
    Merger(options.dir, options.icalfile)
    sys.exit(0)
sys.exit(1)
def QC_merge_runs(self): # if this is a germline sample, QC all of the normal runs with each other. if self.sample_json['sample_type'] == 'germline': # Use the sample_status here to not re-run the QC and to not overwrite run status. The 'sample_status' should be reset to 'pushed' when new runs are pushed.. #if self.sample_json['sample_status'] != 'pending_merge' and self.sample_json['sample_status'] != 'pending_3x3_review' and self.sample_json['sample_status'] != 'merged': # if the user specified the '--pass_fail' option, then run this part still if self.sample_json['sample_status'] == 'pushed' or self.options.pass_fail or self.options.qc_all: # QC the normal runs with each other self.QC_runs(self.sample_json['runs']) # write the sample json file write_json(self.sample_json['json_file'], self.sample_json) # what if there is only one run that passes all of the metrics? It should be marked as the 'final_json' and have the 'pass_fail_merged' flag marked as pass. # make the merger merger = Merger(self.sample_json['json_file']) # Check to see if the normal runs are ready to be merged. merge = merger.check_merge(self.sample_json['runs']) if merge == True: # merge the normal and/or tumor runs. Will only merge the passing runs with each other. merger.merge_runs('germline') # load the sample json file because merger edited it. self.sample_json = json.load(open(self.sample_json['json_file'])) # update the merged run status merger.update_merged_run_status(self.sample_json['merged_json']) if json.load(open(self.sample_json['merged_json']))['pass_fail_merged_status'] == 'pass': # Set the sample_status self.sample_json['sample_status'] = 'merged' # cleanup the individual run bam files self.cleanup_sample.cleanup_runs(self.sample_json['runs'], self.sample_json['analysis']['settings']['cleanup'], self.no_errors) # Cleanup the merged dir self.cleanup_sample.cleanup_runs([self.sample_json['merged_json']], self.sample_json['analysis']['settings']['cleanup'], self.no_errors) else: self.sample_json['sample_status'] = 'awaiting_more_sequencing' # if this is a tumor_normal sample, find the normal and tumor runs, and then QC them with each other. elif self.sample_json['sample_type'] == 'tumor_normal': # Separate the runs into tumor and normal lists normal_runs, tumor_runs = self.getTumor_Normal() if self.sample_json['analysis']['settings']['type'] == 'all_tumor_normal': # Use the sample_status here to not re-run the QC and to not overwrite run status. The 'sample_status' should be reset to 'pushed' when new runs are pushed.. #if self.sample_json['sample_status'] != 'pending_merge' and self.sample_json['sample_status'] != 'pending_3x3_review' and self.sample_json['sample_status'] != 'merged': # if the user specified the '--pass_fail' option, then run this part still if self.sample_json['sample_status'] == 'pushed' or self.options.pass_fail or self.options.qc_all: # QC the normal or tumor runs with each other self.QC_runs(normal_runs, 'normal_') self.QC_runs(tumor_runs, 'tumor_') # now QC the tumor and normal runs together. self.QC_normal_tumor_runs(normal_runs, tumor_runs) # make the excel spreadsheet containing the data and copy it back to the proton #self._make_xlsx() # write the sample json file write_json(self.sample_json['json_file'], self.sample_json) # make the merger merger = Merger(self.sample_json['json_file']) # Check to see if the normal runs are ready to be merged. merge_normal = merger.check_merge(normal_runs, 'Normal/', 'normal_') if merge_normal == True: # merge the normal and/or tumor runs. 
Will only merge the passing runs with each other. merger.merge_runs('normal', 'Normal_', 'normal_') # Check to see if the tumor runs are ready to be merged. merge_tumor = merger.check_merge(tumor_runs, 'Tumor/', 'tumor_') if merge_tumor == True: merger.merge_runs('tumor', 'Tumor_', 'tumor_') # load the sample json file because merger edited it. self.sample_json = json.load(open(self.sample_json['json_file'])) # If any runs were merged, QC them. If there are only 1 normal and tumor run, they won't be QCd again. #if normal_merge_dir != '' or tumor_merge_dir != '' or (len(normal_passing_bams) == 1 and len(tumor_passing_bams) == 1): # only QC all for the actual merged runs for now (PNET). # now QC the tumor and normal merged bams together if both normal and tumor runs are ready. if merge_normal or merge_tumor and ('merged_normal_json' in self.sample_json and 'merged_tumor_json' in self.sample_json): self.sample_json, qc_json = self.qc_run.QC_2Runs(self.sample_json, self.sample_json['merged_normal_json'], self.sample_json['merged_tumor_json'], 'normal_', 'tumor_', '_merged') self.sample_json, merged_perc_avail_bases = self.qc_run.update_3x3_runs_status(self.sample_json, self.sample_json['merged_normal_json'], self.sample_json['merged_tumor_json'], qc_json) # update the merged run status merger.update_merged_run_status(self.sample_json['merged_normal_json'], merged_perc_avail_bases) merger.update_merged_run_status(self.sample_json['merged_tumor_json'], merged_perc_avail_bases) # cleanup the individual run bam files if merged_perc_avail_bases > .9: # Cleanup the PTRIM.bam and chr bam files after all of the QC is done. # are there any other files to clean up? self.cleanup_sample.cleanup_runs(self.sample_json['runs'], self.sample_json['analysis']['settings']['cleanup'], self.no_errors) #self.cleanup_sample.delete_runs(runs, self.sample_json['analysis']['settings']['cleanup'], self.no_errors) # Cleanup after the merging QC is done. self.cleanup_sample.cleanup_runs([self.sample_json['final_normal_json'], self.sample_json['final_tumor_json']], self.sample_json['analysis']['settings']['cleanup'], self.no_errors) # Set the sample_status self.sample_json['sample_status'] = 'merged_pass' else: self.sample_json['sample_status'] = 'awaiting_more_sequencing' # print the final status if self.no_errors == False or self.qc_run.no_errors == False: sys.stderr.write("%s finished with errors. See %s/sge.log for more details"%(self.sample_json['sample_name'], self.sample_json['output_folder'])) self.sample_json['sample_status'] == 'failed' write_json(self.sample_json['json_file'], self.sample_json) sys.exit(1) else: print "%s finished with no errors!"%(self.sample_json['sample_name']) # write the sample json file write_json(self.sample_json['json_file'], self.sample_json) # make the excel spreadsheet containing the data and copy it back to the proton self._make_xlsx()
import matplotlib.pyplot as plt
import numpy as np
from keras import Sequential
from keras.callbacks import History
from keras.layers import Dense, BatchNormalization
from sklearn.model_selection import train_test_split

from loader import Loader
from merger import Merger

params, scores = Loader.get_flow_data(6767, 100)
qualities = Loader.get_task_qualities()
description = Loader.get_description(6767)

merger = Merger(params, description, scores, qualities)
X, y = merger.merge(100)

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# model  --> 0.0017802061972600456
model = Sequential()
model.add(Dense(32, input_shape=(X.shape[1],), activation='relu'))
model.add(BatchNormalization())
model.add(Dense(32, activation='relu'))
model.add(BatchNormalization())
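# The snippet above stops mid-architecture. A plausible continuation, assuming a single
# regression target (the 0.0017... figure in the comment reads like a validation loss),
# would add an output layer, compile with MSE, and fit; the hyperparameters below are
# illustrative, not taken from the original script.
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
history = model.fit(X_train, y_train,
                    validation_data=(X_test, y_test),
                    epochs=100, batch_size=32, verbose=0)

# Inspect the learning curve with the already-imported matplotlib
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.legend()
plt.show()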
value = 0
for code, num in case.items():
    print code, num,
    value += num
print
print value
print str(timeit.timeit('test(case)', 'from __main__ import test, case', number=1)) + 's used.'

TEST_CASES = [
    ['012', '013', '023', '123'],
    ['012', '013', '023', '124', '134', '234'],
    ['012', '013', '023', '123', '124', '134', '234'],
    ['012', '013', '023', '123', '123', '124', '134', '234'],
    ['012', '013', '014', '023', '024', '034', '123', '124', '134', '234'],
    ['012', '023', '013', '123', '123', '234', '134', '124', '125', '127', '157', '257', '125', '127', '157'],
    ['012', '023', '013', '123', '123', '234', '134', '124', '125', '125', '127', '157', '257', '125', '127', '157'],
]

codes = Merger([]).get_codes(3)
case = TEST_CASES[0]
# case = [to_string(random.choice(codes)) for dummy_i in range(10000)]
# case = TEST_CASES[3] * 10
# case = TEST_CASES[0] + TEST_CASES[4]
# case = TEST_CASES[6]
# print str(timeit.timeit('test(case)', 'from __main__ import test, case', number=1)) + 's used.'
# print len(" sdsd\n\n \t".strip())
def disc_merge(self, cluster, dim, vals, skip=None):
    merged = Merger.disc_merge(self, cluster, dim, vals, skip)
    if merged:
        merged.c_range = list(self.c_range)
        merged.inf_func = merged.create_inf_func(self.learner.l)
    return merged