Example #1
def merge_videos():
    global mer

    path_name_seg = []
    for i in all_filename:
        path_name_seg.append(
            os.path.join(target_path, sel_res.getVideoTitle(), i))
    mer = Merger(
        unicode(
            os.path.join(
                target_path,
                sel_res.getVideoTitle() + '.' + sel_res.getFileFormat())),
        path_name_seg)
    gui.frame_main.initTotal_Merge(len(all_filename))
    mer.start()

    while True:
        gui.frame_main.updateMerge(mer.now)
        time.sleep(0.05)
        if mer.now == mer.sum:
            gui.frame_main.updateMerge(mer.now)
            break

    with open('config.ini', 'wb') as f:
        save_configure()

    # '视频已经合并完成,是否删除分段文件?' = "Merging finished. Delete the segment files?"; '提示' = "Notice"
    dlg = wx.MessageDialog(gui.frame_main, u'视频已经合并完成,是否删除分段文件?', u'提示',
                           wx.YES_NO | wx.ICON_QUESTION)
    if dlg.ShowModal() == wx.ID_YES:
        del_seg_video()
        # '分段文件删除完成。' = "Segment files deleted."
        dlg = wx.MessageDialog(gui.frame_main, u'分段文件删除完成。', u'提示',
                               wx.OK | wx.ICON_INFORMATION)
        dlg.ShowModal()
Example #2
def process_multiple(log, do_fetch=True, do_parse=True, do_merge=True):
    root = config["data-dir"]

    if do_fetch:
        tokens = Tokens()
        api = API(tokens, log)
        util.delete_files(root + '/processing/invoices', '*.json')
        success, invoice_cnt = api.fetch_invoice_details(hours_delta=30,
                                                         tz_offset=7)
        if success and invoice_cnt > 0:
            log.write(
                "INFO api invoices extraction succeeded {:,} invoices saved to : {}"
                .format(invoice_cnt, '/processing/invoices'))
        elif success and invoice_cnt == 0:
            log.write(
                "INFO api no invoices extracted (no new/updated invoices in refresh period)"
            )
            return True
        else:
            log.write(
                "ERROR api invoices extraction failed {:,} invoices saved to : {}"
                .format(invoice_cnt, '/processing/invoices'))
            return False

    if do_parse:
        util.delete_files(root + '/processing/invoices', '*.csv')
        parser = Parser(log)
        parser.parse('invoices-line-items')

    if do_merge:
        merger = Merger(log)
        merger.merge_invoice_delta()

    return True
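As far as the snippet shows, the pipeline above only needs a log object exposing a write() method. A hypothetical driver under that assumption (API, Parser, Merger and config["data-dir"] come from the snippet's own context and are not defined here):

import sys

class StdoutLog:
    """Minimal stand-in for the log object; only write() is visible in the snippet."""
    def write(self, msg):
        sys.stdout.write(msg + "\n")

if not process_multiple(StdoutLog(), do_fetch=True, do_parse=True, do_merge=True):
    raise SystemExit("invoice pipeline failed, see log output")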
Example #3
def test(test_case):
    merger = Merger(test_case)
    # codes = merger.get_codes(3)

    for k in range(10, 9, -1):
        result = merger.merge_result(k)
        value_sum = sum(result.values())
        print 'k =', k, result, ' \tMerged codes:', len(result), ' \tTotal value:', value_sum
Example #4
  def __init__(self, *args, **kwargs):
    Merger.__init__(self, *args, **kwargs)

    self.learner_hash = kwargs.get('learner_hash', '')
    self.c_range = kwargs.get('c_range', [0.01, 0.7])
    self.get_frontier = Frontier(self.c_range, 0.)
    self.CACHENAME = './dbwipes.rangemerger.cache'
    self.i = 0
    self.yrange = None
Example #5
    def __init__(self, **kwargs):
        self.full_table = None
        self.bad_tables = []
        self.good_tables = []
        self.bad_err_funcs = []
        self.good_err_funcs = []
        self.err_func = None
        self.cols = None

        Merger.__init__(self, **kwargs)
Example #6
    def test_merge_simple(self):
        m = Merger()
        l = Layer("1")

        l.addChannel(1, 255)
        l.addChannel(2, 127)

        m.addLayer(l)
        m.merge()
        self.assertEqual(m.galaxy[1], 255)
        self.assertEqual(m.galaxy[2], 127)
Example #7
  def setup_stats(self, clusters):
    """
    computes error bounds and the minimum volume of a 0-volume cluster

    adds data structures to cluster object
    """
    Merger.setup_stats(self, clusters)

    for c in clusters:
      c.inf_func = c.create_inf_func(self.learner.l)
      c.c_range = list(self.c_range)
      c.inf_range = [c.inf_func(c.c_range[0]), c.inf_func(c.c_range[1])]
Example #8
    def setup_stats(self, clusters):
        """
    computes error bounds and the minimum volume of a 0-volume cluster

    adds data structures to cluster object
    """
        Merger.setup_stats(self, clusters)

        for c in clusters:
            c.inf_func = self.learner.create_inf_func(c)
            c.c_range = list(self.c_range)
            c.inf_range = [c.inf_func(c.c_range[0]), c.inf_func(c.c_range[1])]
Example #9
    def merge(self, corpus_size):
        """
        The function will merge all the data in the posting files using the BSBI algorithm
        """
        docs_file = self.get_docs_file()
        for key in self.postings_data:
            if os.listdir(self.postings_data[key]['path']):  # directory is not empty
                merger = Merger(self.postings_data[key]['path'], "pkl", docs_file, corpus_size)
                merger.merge(self.postings_data[key]['name'])

        #  The merger updates the docs data. After the merge of all the letters - all the documents data
        #  Is updated and need to be saved on disk to reduce the memory load
        utils.save_obj(docs_file, f"{self.posting_dir_path}\\docs\\docs_index")
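The docstring above only names the BSBI-style merge of the partial posting files. A minimal, self-contained sketch of that idea, streaming several sorted posting files and merging them term by term with a heap, might look like the following; the pickle framing and the (term, postings) record format are assumptions for illustration, not the project's actual layout:

import heapq
import pickle

def iter_postings(path):
    """Yield (term, postings) records from one sorted partial posting file."""
    with open(path, "rb") as f:
        while True:
            try:
                yield pickle.load(f)
            except EOFError:
                return

def merge_posting_files(paths, out_path):
    """Stream-merge several sorted partial posting files into one sorted file."""
    streams = [iter_postings(p) for p in paths]
    with open(out_path, "wb") as out:
        current_term, current_postings = None, []
        for term, postings in heapq.merge(*streams, key=lambda rec: rec[0]):
            if term != current_term:
                if current_term is not None:
                    pickle.dump((current_term, current_postings), out)
                current_term, current_postings = term, []
            current_postings.extend(postings)
        if current_term is not None:
            pickle.dump((current_term, current_postings), out)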
Example #10
def get_IMPA_Merger(name):
    imp = iMPA(name)
    terc = imp.terc
    data = imp.getAddresses()
    s = min(map(lambda x: x.center.y, data))
    w = min(map(lambda x: x.center.x, data))
    n = max(map(lambda x: x.center.y, data))
    e = max(map(lambda x: x.center.x, data))
    addr = getAddresses(map(str, (s, w, n, e)))

    m = Merger(data, addr, terc)
    m.post_func.append(m.merge_addresses)
    m.merge()
    return m
Example #11
def get_impa_merger(name):
    imp = iMPA(name)
    terc = imp.terc
    data = imp.get_addresses()
    s = min(map(lambda x: x.center.y, data))
    w = min(map(lambda x: x.center.x, data))
    n = max(map(lambda x: x.center.y, data))
    e = max(map(lambda x: x.center.x, data))
    addr = get_addresses(map(str, (s, w, n, e)))

    m = Merger(data, addr, terc, "%s.e-mapa.net" % name)
    m.post_func.append(m.merge_addresses)
    m.merge()
    return m
Example #12
    def test_merge_complete(self):
        m = Merger()
        l1 = Layer("1")
        l1.addChannel(2, 1)
        l1.addChannel(3, 255)
        l1.addChannel(4, 127)
        
        l2 = Layer("2")
        l2.addChannel(3, 0, 0.5)
        l2.addChannel(4, 255, "max")
        l2.addChannel(5, 255, "min")
        
        l3 = Layer("3")
        l3.addChannel(2, 255, 0.3)
        
        l4 = Layer("4")
        l4.addChannel(2, 127, 0.6)

        m.addLayer(l1)
        m.addLayer(l2)
        m.addLayer(l3)
        m.addLayer(l4)
        m.merge()

        self.assertEqual(m.galaxy[1], 0)
        self.assertEqual(m.galaxy[2], 107)
        self.assertEqual(m.galaxy[3], 128)
        self.assertEqual(m.galaxy[4], 255)
        self.assertEqual(m.galaxy[5], 0)
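The asserted values in test_merge_simple and test_merge_complete encode the blending rules: a plain value overwrites, a float acts as an opacity for a linear blend, and "max"/"min" keep the extremum. Below is a self-contained sketch written only from those assertions; the real project's Layer/Merger, its channel defaults and its rounding may differ, but this version reproduces the expected numbers:

from collections import defaultdict

class Layer(object):
    def __init__(self, name):
        self.name = name
        self.channels = {}  # channel -> (value, mode); mode is an opacity float, "max" or "min"

    def addChannel(self, channel, value, mode=1.0):
        self.channels[channel] = (value, mode)

class Merger(object):
    def __init__(self):
        self.layers = []
        self.galaxy = defaultdict(int)  # unset channels read back as 0

    def addLayer(self, layer):
        # a layer with the same name replaces the existing one (see test_adding_layers in Example #34)
        self.layers = [lay for lay in self.layers if lay.name != layer.name]
        self.layers.append(layer)

    def delLayer(self, layer):
        self.layers = [lay for lay in self.layers if lay.name != layer.name]

    def merge(self):
        galaxy = defaultdict(int)
        for layer in self.layers:
            for channel, (value, mode) in layer.channels.items():
                base = galaxy[channel]
                if mode == "max":
                    galaxy[channel] = max(base, value)
                elif mode == "min":
                    galaxy[channel] = min(base, value)
                else:  # numeric opacity: blend toward the new value, then round
                    galaxy[channel] = int(base * (1 - mode) + value * mode + 0.5)
        self.galaxy = galaxy

With these rules, channel 2 in test_merge_complete becomes 1, then 0.7*1 + 0.3*255 ≈ 77, then 0.4*77 + 0.6*127 = 107, matching the assertion.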
Example #13
    def __init__(self, visualizer=None, speaker_recognition=False):

        self.merger_to_main_queue = Queue(maxsize=1000)  # very roughly 30sec
        self.merger = Merger(self.merger_to_main_queue)
        if visualizer is None:
            self.visualization = False
        else:
            self.visualization = True
            self.main_to_vis_queue = Queue(maxsize=50)
            self.visualizer = visualizer(self.main_to_vis_queue)

        self.speakers = {}
        self.num_speakers = 0
        self.stt = T2t_stt()
        self.speaker_recognition = speaker_recognition
        # if self.speaker_recognition:
        #     self.sr = Speaker_recognition()
        self.text_queue = mult_Queue()
        self.bing_allowed = False
Example #14
    def set_params(self, **kwargs):
        print kwargs.keys()
        self.cols = kwargs.get('cols', self.cols)
        self.full_table = kwargs.get('full_table', self.full_table)
        self.bad_tables  = kwargs.get('bad_tables', self.bad_tables)
        self.good_tables = kwargs.get('good_tables', self.good_tables)
        self.bad_err_funcs = kwargs.get('bad_err_funcs', self.bad_err_funcs)
        self.good_err_funcs = kwargs.get('good_err_funcs', self.good_err_funcs)
        assert self.bad_tables is not None, "table not set"
        assert self.bad_err_funcs is not None, "error func not set"

        self.table = self.full_table

        domain = self.full_table.domain
        attrnames = [attr.name for attr in domain]
        self.cont_dists = dict(zip(attrnames, Orange.statistics.basic.Domain(self.full_table)))
        self.disc_dists = dict(zip(attrnames, Orange.statistics.distribution.Domain(self.full_table)))

        Merger.set_params(self, **kwargs)
Example #15
    def __init__(self, *args, **kwargs):
        Merger.__init__(self, *args, **kwargs)

        self.learner_hash = kwargs.get('learner_hash', '')
        self.c_range = kwargs.get('c_range', [0.01, 0.7])
        self.get_frontier = Frontier(self.c_range, 0.)
        self.CACHENAME = './dbwipes.rangemerger.cache'
        self.i = 0

        #
        # per execution state
        #

        # dim -> list of value subsets that were not on frontier
        # e.g., subregion -> [ (SR1, SR2), (SR3), ... ]
        self.rejected_disc_vals = defaultdict(list)

        # (dim, direction) -> range it has expanded along
        self.rejected_cont_vals = defaultdict(set)
Example #16
    def __init__(self, indent=None, to_explore=False):
        self.indent = indent
        self.indent_children = None
        self.content = []
        self.parent = None
        self.to_explore = to_explore
        self.merger = Merger()
        self.padding = None
        self.sf = None
        self.sc = None
Example #17
def ProcessRequest(file):

    name = str(uuid4())
    base_file_name = "%s-%s" % (name, secure_filename(file.filename))
    file_name = "tmp/%s" % base_file_name
    print(file_name)
    file.save(file_name)
    with ZipFile(file_name, 'r') as zipObj:
        zipObj.extractall("tmp/%s" % name)
    Merger("tmp/%s" % name, os.path.realpath("tmp/combined-%s.ics" % name))
Example #18
def merge_videos():
    _, res = iqiyi.getLastRes()
    path_name_seg = []
    for i in all_filename:
        path_name_seg.append(os.path.join(video_title, i))
    mer = Merger(
        unicode(
            os.path.join(target_path, video_title + '.' + res[sel_bid]['ff'])),
        path_name_seg)
    gui.frame_main.initTotal_Merge(len(all_filename))
    mer.start()
    while True:
        gui.frame_main.updateMerge(mer.now)
        time.sleep(0.01)
        if mer.now == mer.sum:
            gui.frame_main.updateMerge(mer.now)
            break

    del_seg_video()
Example #19
    def merge(self):
        # self.text_dst.config(state = 'normal')

        text = self.text_src.get('1.0', END)
        # print text.encode('utf-8')
        codes2num = decode(text)
        # print codes2num

        self.merger = Merger(codes2num)

        self.text_dst.delete(0.0, END)

        result_text = ''
        for k in range(10, 3, -1):
            result_text += '最大长度' + str(k) + ' '  # '最大长度' = "maximum length"
            result_text += encode(self.merger.merge_result(k))
            # print result_text

        self.text_dst.insert(END, result_text)
Example #20
    def dim_merge(self, cluster, dim, dec=None, inc=None, skip=None):
        if dec is not None:
            if round(dec, 1) in self.rejected_cont_vals[(dim, 'dec')]:
                return None
        if inc is not None:
            if round(inc, 1) in self.rejected_cont_vals[(dim, 'inc')]:
                return None
        merged = Merger.dim_merge(self, cluster, dim, dec, inc, skip)
        if merged:
            merged.c_range = list(self.c_range)
            merged.inf_func = self.learner.create_inf_func(merged)
        return merged
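The guard above skips a continuous expansion whose rounded bound was already rejected for that dimension and direction. A tiny standalone illustration of the check (the dimension name and values are made up):

from collections import defaultdict

rejected_cont_vals = defaultdict(set)
rejected_cont_vals[('age', 'inc')].add(round(3.14, 1))

# a later expansion to 3.141 rounds to the same 3.1 and is skipped
print(round(3.141, 1) in rejected_cont_vals[('age', 'inc')])  # True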
Example #21
def merge():
    try:
        if request.method == 'OPTIONS':
            return make_response(jsonify({"Allow":"POST"}), 200)

        if not request.json or not 'foreground_url' in request.json or not 'background_url' in request.json:
            abort(400)

        foreground_url = request.json['foreground_url']
        background_url = request.json['background_url']
        m = Merger(foreground_url, background_url)
        m.merge_images()
        response = {
            'output_image':{
                'name': m.get_output_image('name'),
                'url' : url_for('get_image', image_name = m.get_output_image('name'),_external=True),
                'base64' : m.get_output_image('base64')
            }
        }
        return jsonify(response), 201
    except Exception as e:
        err_msg = e.message
        if  err_msg == '':
            err_msg = 'Internal Error. Please Try Again'
        return make_response(jsonify({'error': e.message}), 202)
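A hypothetical client call for the handler above; the host, port and route name are assumptions, while the JSON body shape and the 201 response come from the code itself:

import requests

resp = requests.post(
    'http://localhost:5000/merge',
    json={
        'foreground_url': 'https://example.com/foreground.png',
        'background_url': 'https://example.com/background.png',
    },
)
print(resp.status_code)                    # 201 on success
print(resp.json()['output_image']['url'])  # URL of the merged image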
Example #22
def main():
	scrapper = Scrapper()
	merger = Merger()
	parser = Parser()
	client = MongoClient('localhost', 27017)
	db = client['Data']
	collection_socialmedia = db['socialmedia']

	#Begin real time collecting
	while True: 
		scrapper.scrap()	
		merger.main()
		parser.main()	
		sleep(3600)
		
		# Storing to MongoDB
		f = open( '/home/sartharion/Bureau/stage/POO/data.json', 'r')  
		file_data = json.load(f)
		collection_socialmedia.delete_many({})
		collection_socialmedia.insert_many(file_data)		
	
	client.close()
Example #23
    def __init__(self, cldict, sampd):
        self.cldict = cldict
        self.sampd = sampd
        self.mergo = Merger(cldict, sampd)
        self.meto = Metrics(cldict)

        lbwao = None
        lbbmapo = None
        lref_acc_str = sampd.ref_acc_str
        if lref_acc_str != "none":
            lbwao = AlignerBwa(cldict, sampd)
        self.bwao = lbwao
        self.samfco = SamFC(cldict, sampd)
Example #24
def merge_addr(terc):
    log_io = io.StringIO()
    logging.basicConfig(level=10, handlers=[logging.StreamHandler(log_io), ])
    addr = get_addresses_terc(terc)
    m = Merger([], addr, terc, "emuia.gugik.gov.pl")
    m.create_index()
    m.merge_addresses()
    return make_response(m.get_incremental_result(log_io), 200)
Example #25
def merge_addr(terc):
    logIO = io.StringIO()
    logging.basicConfig(level=10, handlers=[logging.StreamHandler(logIO),])
    addr = json.loads(overpass.getAddresses(terc))
    m = Merger([], addr, terc, "emuia.gugik.gov.pl")
    m._create_index()
    m.merge_addresses()
    return make_response(m.get_incremental_result(logIO), 200)
Example #26
	def do_patch(self, file):
		merger = Merger(file)
		compile = False
		for p in self.patches:
			if p._apply:
				compile = True
				print("Merging program %s for \"%s\"..." % (p.get_program(), p.name()))
				f_m = open(p.get_program(), 'r+b')
				merger.merge(f_m)
				f_m.close()
				print("Program %s for \"%s\" merged.\n" % (p.get_program(), p.name()))

		if compile:
			print("Compiling ...")
			merger.compile()
			print("")

		for p in self.patches:
			if p._apply:
				print("Patching \"%s\"..." % (p.name()))
				file = p.patch(file)
		print("")
		return file
Example #27
    def disc_merge(self, cluster, dim, vals, skip=None):
        # reject if union is a superset of anything in
        # rejected_disc_vals
        vals = set(vals)
        vals.update(cluster.discretes.get(dim, ()))
        for subset in self.rejected_disc_vals[dim]:
            if vals.issuperset(subset):
                return None

        merged = Merger.disc_merge(self, cluster, dim, vals, skip)
        if merged:
            merged.c_range = list(self.c_range)
            merged.inf_func = self.learner.create_inf_func(merged)
        return merged
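The early return above relies on a simple superset test against previously rejected value sets for the same dimension. A standalone illustration with made-up values:

rejected = [{'a', 'b'}, {'c'}]
candidate = {'a', 'b', 'd'}

# rejected, because the candidate covers the previously rejected subset {'a', 'b'}
print(any(candidate.issuperset(subset) for subset in rejected))  # True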
Example #28
def get_history(id: ObjectId, num_changes: int = None):
    hist = history.find({'ref': id}).sort('_id', direction=pymongo.DESCENDING)
    curr = data.find_one({'_id': id})

    yield curr

    prev = curr
    count = 0
    merger = Merger()
    for d in hist:
        if num_changes and count == num_changes:
            break

        d['ref_creation_time'] = d['_id'].generation_time
        del d['_id']
        del d['ref']

        l: dict = copy.deepcopy(prev)
        merger.merge_changes(l, d)

        yield l
        prev = l
        count += 1
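Hypothetical usage of the generator above, walking back through the two most recent changes of a document; it assumes the same module-level history and data collections the snippet itself relies on, and the ObjectId is a placeholder:

from bson import ObjectId

doc_id = ObjectId('64a1f0c2e4b0a1b2c3d4e5f6')  # placeholder, not a real document id
for version in get_history(doc_id, num_changes=2):
    print(version.get('ref_creation_time'), version)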
Example #29
    def __init__(self, cldict, sampd):
        self.cldict = cldict
        self.sampd = sampd
        self.mergo = Merger(cldict, sampd)
        self.meto = Metrics(cldict)

        lbwao = None
        lbbmapo = None
        lref_acc_str = sampd.ref_acc_str
        lhost_ref_str = sampd.host_ref_str

        if lref_acc_str != "none":
            lbwao = AlignerBwa(cldict, sampd)
        if lhost_ref_str != "none":
            lbbmapo = AlignerBBMap(cldict, sampd)
        self.bwao = lbwao
        self.bbmapo = lbbmapo
        self.samfco = SamFC(cldict, sampd)
        self.jlco = CounterJL(cldict, sampd)
        print("Created JLCounter object")
Example #30
    def __init__(self, args):
        self.config_log_file = args.config_log_file
        
        self.sample_id = args.sample_id
        self.project_id = args.project_id
        self.prefix_set = args.prefix_set
        self.bc_set = args.bc_set

        cldict_d = yaml.load(open(self.config_log_file))
        cldict = DictMap(cldict_d)
        self.cldict = cldict

        sampd = dict()
        sampd['sample_id'] = self.sample_id
        sampd['project_id'] = self.project_id
        sampd['prefix_set'] = self.prefix_set
        sampd['bc_set'] = self.bc_set
        sampd_map = DictMap(sampd) 
        self.sampd = sampd_map
 
        mergo = Merger(cldict, sampd_map)
        self.mergo = mergo
Example #31
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="""Merge osm file with address nodes with buildings in specified area as terc code"""
    )
    parser.add_argument('--addr', help='File with address nodes to merge', required=True)
    parser.add_argument('--building', help='File with buildings to merge', required=True)
    parser.add_argument('--output', help='output file with merged data (default: result.osm)')
    parser.add_argument('--terc', help='Teryt TERC code for area processed')
    parser.add_argument('--log-level', help='Set logging level (debug=10, info=20, warning=30, error=40, critical=50), default: 20', dest='log_level', default=20, type=int)

    args = parser.parse_args()

    log_stderr = logging.StreamHandler()
    log_stderr.setLevel(args.log_level)
    logIO = io.StringIO()
    logging.basicConfig(level=10, handlers=[log_stderr, logging.StreamHandler(logIO)])

    if args.output:
        output = open(args.output, "wb")
    else:
        parts = args.input.rsplit('.', 1)
        parts[0] += '-merged'
        output = open('.'.join(parts), "xb")
        print("Output filename: %s" % ('.'.join(parts),))

    data = [OsmAddress.from_soup(x) for x in osm_to_json(lxml.etree.parse(open(args.addr)))['elements']]

    addr = osm_to_json(open(args.building))

    m = Merger(data, addr, args.terc)
    for i in data:
        m._do_merge_create_point(i)
    m.create_index()
    m.merge_addresses()
    output.write(m.get_incremental_result(logIO))
Example #32
    def QC_tumor_normal(self):
        # Separate the runs into tumor and normal lists
        normal_runs, tumor_runs = self.getTumor_Normal()

        if self.sample_json['analysis']['settings'][
                'type'] == 'all_tumor_normal':
            # Use the sample_status here to not re-run the QC and to not overwrite run status. The 'sample_status' should be reset to 'pushed' when new runs are pushed..
            #if self.sample_json['sample_status'] != 'pending_merge' and self.sample_json['sample_status'] != 'pending_3x3_review' and self.sample_json['sample_status'] != 'merged':
            # if the user specified the '--pass_fail' option, then run this part still
            if self.sample_json[
                    'sample_status'] == 'pushed' or self.options.pass_fail or self.options.qc_all:
                # QC the normal or tumor runs with each other
                self.QC_runs(normal_runs, 'normal_')
                self.QC_runs(tumor_runs, 'tumor_')
                # now QC the tumor and normal runs together.
                self.QC_normal_tumor_runs(normal_runs, tumor_runs)

            # make the merger
            merger = Merger(self.sample_json, self.options.recalc_3x3_tables)
            # Check to see if the normal runs are ready to be merged.
            self.sample_json, merge_normal = merger.check_merge(
                normal_runs, 'Normal/', 'normal_')
            if merge_normal == True:
                # merge the normal and/or tumor runs. Will only merge the passing runs with each other.
                self.sample_json = merger.merge_runs('normal', 'Normal_',
                                                     'normal_')

            # Check to see if the tumor runs are ready to be merged.
            self.sample_json, merge_tumor = merger.check_merge(
                tumor_runs, 'Tumor/', 'tumor_')
            if merge_tumor == True:
                self.sample_json = merger.merge_runs('tumor', 'Tumor_',
                                                     'tumor_')

            # If any runs were merged, QC them. If there are only 1 normal and tumor run, they won't be QCd again.
            #if normal_merge_dir != '' or tumor_merge_dir != '' or (len(normal_passing_bams) == 1 and len(tumor_passing_bams) == 1):
            # now QC the tumor and normal merged bams together if both normal and tumor runs are ready.
            # To only QC all for the actual merged runs (PNET), change the 'final' part to 'merged'.
            # The 'final_normal_json' and 'final_tumor_json' flags are set by merger.py in the function check_merge, line 157
            #if (merge_normal or merge_tumor) and ('merged_normal_json' in self.sample_json and 'merged_tumor_json' in self.sample_json):
            if 'final_normal_json' in self.sample_json and 'final_tumor_json' in self.sample_json:
                self.sample_json, qc_json = self.qc_run.QC_2Runs(
                    self.sample_json, self.sample_json['final_normal_json'],
                    self.sample_json['final_tumor_json'], 'normal_', 'tumor_',
                    '_merged')
                self.sample_json, merged_perc_avail_bases = self.qc_run.update_3x3_runs_status(
                    self.sample_json, self.sample_json['final_normal_json'],
                    self.sample_json['final_tumor_json'], qc_json)
                # update the merged run status
                merger.update_merged_run_status(
                    self.sample_json['final_normal_json'],
                    merged_perc_avail_bases)
                merger.update_merged_run_status(
                    self.sample_json['final_tumor_json'],
                    merged_perc_avail_bases)

                # cleanup the individual run bam files
                if merged_perc_avail_bases > .9:
                    final_qc_dir = "%s/all%svs%s" % (
                        self.sample_json['qc_folder'],
                        json.load(open(self.sample_json['final_normal_json']))
                        ['run_name'],
                        json.load(open(
                            self.sample_json['final_tumor_json']))['run_name'])
                    # annotate the final somatic variants
                    command = "bash %s/Somatic_Variants/somatic_variants.sh %s %s %s" % (
                        self.sample_json['analysis']['software_directory'],
                        final_qc_dir, self.sample_json['sample_name'],
                        self.sample_json['analysis']['software_directory'])
                    if runCommandLine(command) != 0:
                        sys.stderr.write("ERROR: somatic annotation failed!\n")

                    # Cleanup the PTRIM.bam and chr bam files after all of the QC is done.
                    # are there any other files to clean up?
                    self.cleanup_sample.cleanup_runs(
                        self.sample_json['runs'],
                        self.sample_json['analysis']['settings']['cleanup'],
                        self.no_errors)
                    #self.cleanup_sample.delete_runs(runs, self.sample_json['analysis']['settings']['cleanup'], self.no_errors)

                    # Cleanup after the merging QC is done.
                    self.cleanup_sample.cleanup_runs([
                        self.sample_json['final_normal_json'],
                        self.sample_json['final_tumor_json']
                    ], self.sample_json['analysis']['settings']['cleanup'],
                                                     self.no_errors)

                    # Set the sample_status
                    self.sample_json['sample_status'] = 'merged_pass'
                else:
                    self.sample_json[
                        'sample_status'] = 'awaiting_more_sequencing'
Example #33
    def QC_germline(self):
        # Use the sample_status here to not re-run the QC and to not overwrite run status. The 'sample_status' should be reset to 'pushed' when new runs are pushed..
        #if self.sample_json['sample_status'] != 'pending_merge' and self.sample_json['sample_status'] != 'pending_3x3_review' and self.sample_json['sample_status'] != 'merged':
        # if the user specified the '--pass_fail' option, then run this part still
        if self.sample_json[
                'sample_status'] == 'pushed' or self.options.pass_fail or self.options.qc_all:
            # QC the normal runs with each other
            self.QC_runs(self.sample_json['runs'])

        # what if there is only one run that passes all of the metrics? It should be marked as the 'final_json' and have the 'pass_fail_merged' flag marked as pass.
        # make the merger
        merger = Merger(self.sample_json, self.options.recalc_3x3_tables)
        # Check to see if the normal runs are ready to be merged.
        self.sample_json, merge = merger.check_merge(self.sample_json['runs'])
        if merge != True:
            if 'final_json' in self.sample_json:
                # update the final run status
                merger.update_merged_run_status(self.sample_json['final_json'])
        elif merge == True:
            # merge the normal and/or tumor runs. Will only merge the passing runs with each other.
            self.sample_json = merger.merge_runs('germline')

            # update the merged run status
            merger.update_merged_run_status(self.sample_json['merged_json'])

            if json.load(open(self.sample_json['merged_json'])
                         )['pass_fail_merged_status'] == 'pass':
                # Set the sample_status
                self.sample_json['sample_status'] = 'merged_pass'
                # cleanup the individual run bam files
                self.cleanup_sample.cleanup_runs(
                    self.sample_json['runs'],
                    self.sample_json['analysis']['settings']['cleanup'],
                    self.no_errors)
                # Cleanup the merged dir
                self.cleanup_sample.cleanup_runs(
                    [self.sample_json['merged_json']],
                    self.sample_json['analysis']['settings']['cleanup'],
                    self.no_errors)
            else:
                self.sample_json['sample_status'] = 'awaiting_more_sequencing'

        # copy the final run's VCF file to the final_dir if it passes the "merged" coverage flag
        if 'final_json' in self.sample_json:
            final_json = json.load(open(self.sample_json['final_json']))
            if final_json['pass_fail_merged_status'] == 'pass':
                final_vcf = glob.glob("%s/*.vcf" % final_json['run_folder'])[0]
                final_project_dir = "/home/ionadmin/jeff/%s_Final_VCFs" % (
                    self.sample_json['project'])
                print "copying %s to %s" % (final_vcf, final_project_dir)
                # check to make sure the final dir exists.
                if not os.path.isdir(final_project_dir):
                    os.mkdir(final_project_dir)
                shutil.copy(
                    final_vcf, "%s/%s.vcf" %
                    (final_project_dir, self.sample_json['sample_name']))
                # now push the sample to s3 storage
                if self.sample_json['project'] == 'Einstein':
                    print "pushing %s to amazon s3 storage" % self.sample_json[
                        'sample_name']
                    self.push_sample_to_s3(final_json)
Example #34
    def test_adding_layers(self):

        m = Merger()
        l = Layer("1")
        m.addLayer(l)
        self.assertEqual(m.layers, [l])


        m = Merger()
        l = Layer("1")
        l2 = Layer("1")
        m.addLayer(l)
        m.addLayer(l2)

        self.assertEqual(m.layers, [l2])

        m.delLayer(l2)
        m.delLayer(l)
Example #35
if __name__ == "__main__":

    if len(sys.argv) != 2:
        print "usage: python builder.py [filename]"

    archive = sys.argv[1]

    fp = FileParser()
    fp.extract(archive, "tmp")
    extract_all(archive)
    # get filename from archive: nz.tar => tmp/nz_merged
    files = fp.getFiles("tmp/" + archive.split(".")[0] + "_merged")

    t = time.time()
    r = IndexBuilder(files, UrlTable(), CParser(), Pipeline())
    printf("parsing %d files:" % len(r.files))
    r.process()

    print "\nparsed %d pages in %d files for %f seconds" % (r.page_id, r.id, time.time() - t)
    print "avarage %f second for parsing each files" % ((time.time() - t) / r.id)
    print "started to build revert_index: "

    t = time.time()
    m = Merger("tmp")
    m.merge()
    print "\nbuild reverted index for %d records in %f seconds" % (r.uid, (time.time() - t))
    cleanup("tmp")

    build_index("rindex")
Example #36
class SAM:
    def __init__(self, visualizer=None, speaker_recognition=False):

        self.merger_to_main_queue = Queue(maxsize=1000)  # very roughly 30sec
        self.merger = Merger(self.merger_to_main_queue)
        if visualizer is None:
            self.visualization = False
        else:
            self.visualization = True
            self.main_to_vis_queue = Queue(maxsize=50)
            self.visualizer = visualizer(self.main_to_vis_queue)

        self.speakers = {}
        self.num_speakers = 0
        self.stt = T2t_stt()
        self.speaker_recognition = speaker_recognition
        # if self.speaker_recognition:
        #     self.sr = Speaker_recognition()
        self.text_queue = mult_Queue()
        self.bing_allowed = False

    def handle_service(self, req):
        rospy.loginfo("entered handle service")
        self.visualizer.idle = False
        # msg = Int32()
        # msg.data = 0
        # self.ledmode_pub.publish(msg)
        queue = mult_Queue()
        self.bing_allowed = True
        p = Process(target=self.stt_subprocess, args=(queue, ))
        p.start()
        p.join()
        # msg = msg_Empty()
        # self.ledfreeze_pub.publish(msg)
        # self.visualizer.idle = True
        self.bing_allowed = False
        return queue.get()

    def stt_subprocess(self, q):
        # clear the text queue
        # rospy.loginfo("clear the text queue")
        # while not self.text_queue.empty():
        #     rospy.loginfo("got an item from the queue ->" + self.text_queue.get())

        # wait for the next text to arrive
        rospy.loginfo("going to wait for the text_queue to be filled again")
        rate = rospy.Rate(1)
        while self.text_queue.empty() and not rospy.is_shutdown():
            rospy.loginfo("still waiting, current length : " +
                          str(self.text_queue.qsize()))
            rate.sleep()

        # put it into the return queue
        rospy.loginfo("got one and put it into the dedicated queue")
        q.put(self.text_queue.get())

    def mode_callback(self, msg):
        if msg.mode == 2:
            self.visualizer.heartbeat = True

    def freeze_callback(self, msg):
        self.visualizer.heartbeat = False

    def run(self):
        self.merger.start()

        if self.visualization:
            self.visualizer.start()

        recording_id_odas = [0, 0, 0, 0]
        last_recording_id_odas = [0, 0, 0, 0]

        recordings = {}
        # request to speaker recognition waiting to be answered, key is the id,
        # value is the queue in which the result will be stored
        sr_requests = {}

        # kevins ros changes
        pub = rospy.Publisher('/roboy/cognition/sam/output',
                              String,
                              queue_size=10)
        rospy.Subscriber("/roboy/control/matrix/leds/mode", ControlLeds,
                         self.mode_callback)
        rospy.Subscriber("/roboy/control/matrix/leds/freeze", msg_Empty,
                         self.freeze_callback)
        # s = rospy.Service('/roboy/cognition/speech/recognition', RecognizeSpeech, self.handle_service)
        # self.ledmode_pub = rospy.Publisher("/roboy/control/matrix/leds/mode/simple", Int32, queue_size=3)
        # self.ledoff_pub = rospy.Publisher('/roboy/control/matrix/leds/off', msg_Empty, queue_size=10)
        # self.ledfreeze_pub = rospy.Publisher("/roboy/control/matrix/leds/freeze", msg_Empty, queue_size=1)
        # self.ledpoint_pub = rospy.Publisher("/roboy/control/matrix/leds/point", Int32, queue_size=1)
        rospy.init_node("SAM", anonymous=True)

        # operation average
        angle_list = []

        while self.merger.is_alive() and not rospy.is_shutdown():

            # we do ask for the next data block
            # maybe this is the place where i can insert a call and replace the while loop

            # wait for/get next data
            try:
                next_data = self.merger_to_main_queue.get(block=True,
                                                          timeout=1)
            except q_Empty:
                continue  # restart loop, but check again if we maybe got a stop signal

            cid = next_data['id_info']
            caudio = next_data['audio_data']

            ############################################################################################
            # this part separates the 4 streams and manages the ones where currently audio is being recorded
            #########################################################################################
            # cid[i] = [id, x, y, z, activity]
            for i in range(len(cid)):  # len=4

                recording_id_odas[i] = cid[i][0]

                if recording_id_odas[i] > 0:
                    if recording_id_odas[i] == last_recording_id_odas[i]:
                        # same person continues speaking
                        recordings[recording_id_odas[i]].audio = np.append(
                            recordings[recording_id_odas[i]].audio, caudio[i])
                        recordings[recording_id_odas[i]].currentpos = [
                            cid[i][1], cid[i][2], cid[i][3]
                        ]

                    else:
                        # a person started speaking
                        recordings[recording_id_odas[i]] = Recording(
                            recording_id_odas[i],
                            [cid[i][1], cid[i][2], cid[i][3]])
                        recordings[recording_id_odas[i]].audio = np.append(
                            recordings[recording_id_odas[i]].audio, caudio[i])

                        # if a different person was speaking before, he is now done
                        if last_recording_id_odas[i] > 0:
                            recordings[
                                last_recording_id_odas[i]].stopped = True
                elif recording_id_odas[
                        i] == 0 and last_recording_id_odas[i] > 0:
                    # if a different person was speaking before, he is now done
                    recordings[last_recording_id_odas[i]].stopped = True

                last_recording_id_odas[i] = recording_id_odas[i]

            ##########################################################
            # check if we got any answers from sr (speaker recognition) in the meantime
            #############################################################
            to_delete_req = []
            for rec_id, req in sr_requests.iteritems():
                try:
                    # sr_id: -99 means new speaker
                    # certainty between 0-10
                    certainty = 0
                    preliminary_id, sr_id, certainty = req.get(block=False)

                    # Fuse info of speaker recognition and localization together
                    # First the best case: both agree on an existing/new speaker
                    if sr_id == recordings[rec_id].preliminary_speaker_id:
                        # both agree, that's nice
                        recordings[rec_id].final_speaker_id = recordings[
                            rec_id].preliminary_speaker_id
                        recordings[rec_id].send_to_trainer = True
                    elif recordings[
                            rec_id].created_new_speaker and sr_id == -99:
                        # both agree, that this is a new speaker
                        output_string = "both agree that rec %d is new speaker %d" % (
                            rec_id, recordings[rec_id].preliminary_speaker_id)
                        rospy.logdebug(output_string)
                        recordings[rec_id].final_speaker_id = recordings[
                            rec_id].preliminary_speaker_id
                        recordings[rec_id].send_to_trainer = True
                    else:

                        # Now come the harder parts.
                        if certainty < 1:
                            # if speaker recognition is unsure we rely on localization
                            recordings[rec_id].final_speaker_id = recordings[
                                rec_id].preliminary_speaker_id
                        elif certainty > 8:
                            # sr is super sure, we trust it
                            recordings[rec_id].final_speaker_id = sr_id
                            recordings[rec_id].sr_changed_speaker = True
                        else:
                            # check the angle to the speaker sr suggested, and decide depending on the certainty
                            # go through the list of speaker angles and find the one which sr suggests
                            found = False
                            for (oth_id, angl
                                 ) in recordings[rec_id].angles_to_speakers:
                                if oth_id == sr_id:
                                    # the further away we are, the surer sr has to be
                                    if certainty * 20 > angl:
                                        recordings[
                                            rec_id].final_speaker_id = sr_id
                                        recordings[
                                            rec_id].sr_changed_speaker = True
                                    else:
                                        recordings[
                                            rec_id].final_speaker_id = recordings[
                                                rec_id].preliminary_speaker_id
                                    found = True
                                    break
                            if not found:
                                # this shouldn't happen
                                output_string = "Speaker recognition suggestested id {} for recording {}," \
                                                " which doesn't exist".format(sr_id, rec_id)
                                rospy.logerr(output_string)
                                recordings[
                                    rec_id].final_speaker_id = recordings[
                                        rec_id].preliminary_speaker_id

                    output_string = "response for req %d, results is %d, certanty %d" % (
                        rec_id, sr_id, certainty)
                    rospy.logdebug(output_string)
                    recordings[rec_id].is_back_from_sr = True
                    to_delete_req.append(rec_id)

                except q_Empty:
                    if time.time() - recordings[
                            rec_id].time_sent_to_sr > 3:  # no response from sr for 3 sec -> timeout
                        # print("no response for request %d in 3 sec -> timeout" % (rec_id))
                        recordings[rec_id].final_speaker_id = recordings[
                            rec_id].preliminary_speaker_id
                        recordings[rec_id].is_back_from_sr = True
                        to_delete_req.append(rec_id)

            for req in to_delete_req:
                del sr_requests[req]

            ##################################################################################
            # here we go through our recordings and handle them based on their current status
            ####################################################################################
            to_delete = []

            rec_info_to_vis = []

            for rec_id, rec in recordings.iteritems():
                if self.visualization and not rec.stopped:
                    # convert audio to energy and append it to the tuple
                    # Energy is the root mean square of the signal
                    # E = sqrt(sum(s[n]^2)/N)
                    curr_energy = np.sqrt(np.mean(np.square(rec.audio.data)))
                    if not rec.stopped:
                        rec_info_to_vis.append([
                            rec_id, rec.currentpos[0], rec.currentpos[1],
                            rec.currentpos[2], 200, curr_energy
                        ])  # 200 is the size of the blob
                    else:
                        rec_info_to_vis.append([
                            rec_id, rec.currentpos[0], rec.currentpos[1],
                            rec.currentpos[2], 50, curr_energy
                        ])

                if rec.new:
                    output_string = "new recording " + str(rec_id)
                    rospy.loginfo(output_string)
                    # get angles to all known speakers
                    rec.get_angles_to_all_speakers(self.speakers, rec.startpos)

                    # if it is within a certain range of a known speaker, assign it to him
                    if len(
                            self.speakers
                    ) > 0 and rec.angles_to_speakers[0][1] < 35:  # degree
                        output_string = "preliminary assigning recording %d to speaker %d, angle is %d" % (
                            rec_id, rec.angles_to_speakers[0][0],
                            rec.angles_to_speakers[0][1])
                        rospy.loginfo(output_string)
                        rec.preliminary_speaker_id = rec.angles_to_speakers[0][
                            0]
                        rec.final_speaker_id = rec.preliminary_speaker_id  # this will be overwritten later

                    else:
                        # create a new speaker
                        self.num_speakers += 1
                        new_id = self.num_speakers
                        self.speakers[new_id] = Speaker(new_id, rec.startpos)
                        rec.preliminary_speaker_id = new_id
                        rec.final_speaker_id = rec.preliminary_speaker_id  # this will be overwritten later
                        rec.created_new_speaker = True
                        closest_ang = -999
                        if len(rec.angles_to_speakers) > 0:
                            closest_ang = rec.angles_to_speakers[0][1]
                        output_string = "creating new speaker %d for recording %d, closest angle is %d" % (
                            new_id, rec_id, closest_ang)
                        rospy.logdebug(output_string)

                        if self.num_speakers == 1:
                            rec.send_to_trainer = True

                    rec.new = False

                # elif self.speaker_recognition and (not rec.was_sent_sr and rec.audio.shape[
                #     0] > 16000 * 3):  # its longer than 3 sec, time to send it to speaker recognition
                #     sr_requests[rec_id] = Queue(maxsize=1)
                #     self.sr.test(rec.audio, rec.preliminary_speaker_id, sr_requests[rec_id])
                #     rec.was_sent_sr = True
                #     rec.time_sent_to_sr = time.time()

                elif rec.stopped:
                    # speaker finished, handle this
                    if not rec.alldone:
                        if rec.audio.shape[
                                0] < 16000 * 0.4:  # everything shorter than this we simply discard
                            output_string = "recording %d was too short, discarding" % (
                                rec_id)
                            print output_string
                            rospy.loginfo(output_string)
                            if rec.created_new_speaker:
                                del self.speakers[rec.preliminary_speaker_id]
                                output_string = "thus also deleting speaker" + str(
                                    rec.preliminary_speaker_id)
                                rospy.logdebug(output_string)
                            rec.alldone = True
                    if not rec.alldone:
                        if (rec.was_sent_sr and
                                rec.is_back_from_sr) or (not rec.was_sent_sr):
                            if not rec.was_sent_sr:
                                # it seems like this was too short to be sent to sr
                                rec.final_speaker_id = rec.preliminary_speaker_id
                            self.speakers[
                                rec.final_speaker_id].pos = rec.currentpos

                            if rec.created_new_speaker and rec.sr_changed_speaker:
                                try:
                                    del self.speakers[
                                        rec.preliminary_speaker_id]
                                except:
                                    output_string = "Error deleting preliminary speaker " + str(
                                        rec.preliminary_speaker_id)
                                    print output_string
                                    rospy.logerr(output_string)

                            # TODO:
                            # send to speech to text
                            if self.bing_allowed:
                                text = self.stt.get_text(rec.audio)
                                # wavfile.write(text.encode('utf-8') + ".wav", 16000, rec.audio.data)
                            else:
                                text = "bing is not allowed yet"
                            # output_string = "Speaker {}: ".format(rec.final_speaker_id) + text.encode('utf-8')
                            output_string = text.encode('utf-8')
                            rospy.loginfo(output_string)
                            pub.publish(output_string)

                            if self.bing_allowed:
                                self.text_queue.put(output_string)
                                rospy.logdebug("text_queue lenght in main: " +
                                               str(self.text_queue.qsize()))

                            # send this to trainer
                            # if self.speaker_recognition and rec.send_to_trainer:
                            #     self.sr.train(rec.final_speaker_id, rec.audio)
                            #     output_string = "sending recording %d to trainer" % (rec_id)
                            #     rospy.logdebug(output_string)

                            output_string = "succesfully handeld recording " + str(
                                rec_id)
                            rospy.logdebug(output_string)
                            rec.alldone = True
                        else:
                            pass  # wait for the response of sr

                if rec.alldone:
                    to_delete.append(rec_id)

            for rec_id in to_delete:
                del recordings[rec_id]

            if self.visualization:
                try:
                    self.main_to_vis_queue.put(
                        {
                            'speakers': self.speakers,
                            'recordings': rec_info_to_vis
                        },
                        block=False)

                except Full:
                    # print("couldn't put data into visualization queue, its full")
                    pass

            # ---------------------------------------------------------------------------------------------------
            # new doa to led addon
            # print
            # print "------------------------------------"
            # print "speakers: "
            # print self.speakers
            # print "rec_info_to_vis: "
            # operation average
            # if len(rec_info_to_vis) > 0 and not self.bing_allowed:
            #     # print "0 -> ", rec_info_to_vis[0][0]
            #     # print "1 -> ", rec_info_to_vis[0][1]
            #     # print "2 -> ", rec_info_to_vis[0][2]
            #     # print "3 -> ", rec_info_to_vis[0][3]
            #     # print "4 -> ", rec_info_to_vis[0][4]
            #     angle_list.append(rec_info_to_vis[0][1])
            #     if len(angle_list) >= 10:
            #         publish_point_left_right(self.ledpoint_pub, sum(angle_list)/len(angle_list))
            #         angle_list = []
            # else:
            #     print "Empty dude"
            # print "------------------------------------"
            # print
            # publish_point(self.ledpoint_pub, rec_info_to_vis[1])
            # ---------------------------------------------------------------------------------------------------

        output_string = "SAM is done."
        print output_string
        rospy.loginfo(output_string)
        self.merger.stop()
        if self.visualization:
            self.visualizer.stop()
        rospy.signal_shutdown("SAM is done.")
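The visualization branch above computes blob energy as the root mean square of the audio frame, E = sqrt(mean(s[n]^2)). A small standalone check of that formula with made-up samples:

import numpy as np

frame = np.array([0.0, 3.0, -4.0, 0.0])
energy = np.sqrt(np.mean(np.square(frame)))
print(energy)  # sqrt((9 + 16) / 4) = 2.5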
Example #37
    def my_fit(self, Xs, y, time_ramain, X_test):
        np.random.seed(CONSTANT.SEED)

        split = CONSTANT.SPLIT

        self.split = split

        log(f'split {split}')

        if split == -1:
            config = Config(time.time(), self.info['time_budget'])

            X_test.index = -X_test.index - 1

            main_shape = Xs[CONSTANT.MAIN_TABLE_NAME].shape[0]
            main_max_shape = 2888888
            main_min_shape = min(main_shape, 100000)

            test_shape = X_test.shape[0]
            max_accept_shape = 3999999

            if main_shape + test_shape > max_accept_shape:
                sample_main_shape = max_accept_shape - test_shape
                if sample_main_shape > main_max_shape:
                    sample_main_shape = main_max_shape
                if sample_main_shape < main_min_shape:
                    sample_main_shape = main_min_shape
                log(f'start sample main table. origin main shape {main_shape} test shape {test_shape} sample rows num {sample_main_shape}'
                    )
                if 'time_col' in self.info:
                    key_time_col = self.info['time_col']
                    if key_time_col in Xs[CONSTANT.MAIN_TABLE_NAME].columns:
                        Xs[CONSTANT.MAIN_TABLE_NAME].sort_values(
                            by=key_time_col, inplace=True)
                Xs[CONSTANT.MAIN_TABLE_NAME] = Xs[
                    CONSTANT.MAIN_TABLE_NAME].iloc[-sample_main_shape:]
                gc.collect()

            Xs[CONSTANT.MAIN_TABLE_NAME] = pd.concat(
                [Xs[CONSTANT.MAIN_TABLE_NAME], X_test])

            X_test.drop(X_test.columns, axis=1, inplace=True)
            gc.collect()

            graph = Graph(self.info, Xs)
            graph.sort_tables()
            train_index = Xs[CONSTANT.MAIN_TABLE_NAME].index[
                Xs[CONSTANT.MAIN_TABLE_NAME].index >= 0]
            y = y.loc[train_index]
            test_index = Xs[CONSTANT.MAIN_TABLE_NAME].index[
                Xs[CONSTANT.MAIN_TABLE_NAME].index < 0]

            graph.preprocess_fit_transform()
            gc.collect()

            merge_feat_pipeline = DeafultMergeFeatPipeline()
            merger = Merger(merge_feat_pipeline)

            merger.merge_table(graph)
            main_table = merger.merge_to_main_fit_transform(graph)
            self.release_tables(Xs, graph)
            del merger
            del graph
            gc.collect()

            feat_pipeline = DefaultFeatPipeline()
            feat_engine = FeatEngine(feat_pipeline, config)
            feat_engine.fit_transform_order1(main_table, y)

            sample_for_combine_features = True

            if sample_for_combine_features:
                main_data = main_table.data
                train_data = main_data.loc[main_data.index >= 0]

                del main_data

                sample_num = CONSTANT.SAMPLE_NUM
                train_shape = train_data.shape

                if train_shape[0] <= sample_num:
                    sample_for_combine_features = False
                else:
                    data_tail_new = train_data.iloc[-sample_num:]

                    gc.collect()

                    y_tail_new = y.loc[data_tail_new.index]

                    table_tail_new = copy.deepcopy(main_table)
                    table_tail_new.data = data_tail_new

                    del data_tail_new
                    gc.collect()

                    feat_engine.fit_transform_all_order2(table_tail_new,
                                                         y_tail_new,
                                                         sample=True)
                    feat_engine.fit_transform_keys_order2(table_tail_new,
                                                          y_tail_new,
                                                          sample=True)

                    del table_tail_new, y_tail_new
                    gc.collect()

                    feat_engine.fit_transform_all_order2(main_table,
                                                         y,
                                                         selection=False)
                    feat_engine.fit_transform_keys_order2(main_table,
                                                          y,
                                                          selection=False)

                    feat_engine.fit_transform_post_order1(main_table, y)

            if not sample_for_combine_features:
                gc.collect()

                feat_engine.fit_transform_all_order2(main_table, y)
                feat_engine.fit_transform_keys_order2(main_table, y)

                feat_engine.fit_transform_keys_order3(main_table, y)
                feat_engine.fit_transform_post_order1(main_table, y)

            del feat_engine
            gc.collect()

            X_test = main_table.data.loc[test_index]
            main_table.data = main_table.data.loc[train_index]

            gc.collect()

            test_table = copy.deepcopy(main_table)
            test_table.data = X_test
            self.test_table = test_table
            len_test = X_test.shape[0]
            gc.collect()

            feat_engine = FeatEngine(feat_pipeline, config)
            feat_engine.fit_transform_merge_order1(main_table, y)
            self.feat_engine = feat_engine

            feat_output = FeatOutput()
            self.feat_output = feat_output
            X, y, categories = feat_output.final_fit_transform_output(
                main_table, y)

            del main_table
            gc.collect()

            lgb = AutoLGB()

            lgb.param_compute(X, y, categories, config)
            X_train, y_train, X_test, y_test = time_train_test_split(
                X, y, test_rate=0.2)

            lgb.param_opt_new(X_train, y_train, X_test, y_test, categories)

            gc.collect()

            del X_train, y_train, X_test, y_test

            gc.collect()

            X, y = self.shuffle(X, y, 2019)
            gc.collect()

            lgb.ensemble_train(X, y, categories, config, len_test)

            gc.collect()

            importances = lgb.get_ensemble_importances()

            self.model = lgb
            del X, y

        elif split == -2:

            config = Config(time.time(), self.info['time_budget'])

            Xs[CONSTANT.MAIN_TABLE_NAME] = pd.concat([
                Xs[CONSTANT.MAIN_TABLE_NAME],
            ])

            gc.collect()

            graph = Graph(self.info, Xs)
            graph.sort_tables()
            train_index = Xs[CONSTANT.MAIN_TABLE_NAME].index[
                Xs[CONSTANT.MAIN_TABLE_NAME].index >= 0]
            y = y.loc[train_index]

            graph.preprocess_fit_transform()
            gc.collect()

            merge_feat_pipeline = DeafultMergeFeatPipeline()
            merger = Merger(merge_feat_pipeline)

            merger.merge_table(graph)
            main_table = merger.merge_to_main_fit_transform(graph)
            self.release_tables(Xs, graph)
            del merger
            del graph
            gc.collect()

            feat_pipeline = DefaultFeatPipeline()
            feat_engine = FeatEngine(feat_pipeline, config)
            feat_engine.fit_transform_order1(main_table, y)

            sample_for_combine_features = True

            if sample_for_combine_features:
                main_data = main_table.data
                train_data = main_data.loc[main_data.index >= 0]

                del main_data

                sample_num = CONSTANT.SAMPLE_NUM
                train_shape = train_data.shape

                if train_shape[0] <= sample_num:
                    sample_for_combine_features = False
                else:
                    data_tail_new = train_data.iloc[-sample_num:]

                    gc.collect()
                    log(f'sample data shape {data_tail_new.shape}')

                    y_tail_new = y.loc[data_tail_new.index]

                    table_tail_new = copy.deepcopy(main_table)
                    table_tail_new.data = data_tail_new

                    del data_tail_new
                    gc.collect()

                    feat_engine.fit_transform_all_order2(table_tail_new,
                                                         y_tail_new,
                                                         sample=True)
                    feat_engine.fit_transform_keys_order2(table_tail_new,
                                                          y_tail_new,
                                                          sample=True)

                    del table_tail_new, y_tail_new
                    gc.collect()

                    feat_engine.fit_transform_all_order2(main_table,
                                                         y,
                                                         selection=False)
                    feat_engine.fit_transform_keys_order2(main_table,
                                                          y,
                                                          selection=False)
                    feat_engine.fit_transform_post_order1(main_table, y)

            if not sample_for_combine_features:
                gc.collect()

                feat_engine.fit_transform_all_order2(main_table, y)
                feat_engine.fit_transform_keys_order2(main_table, y)
                feat_engine.fit_transform_keys_order3(main_table, y)
                feat_engine.fit_transform_post_order1(main_table, y)

            del feat_engine
            gc.collect()

            main_table.data = main_table.data.loc[train_index]

            gc.collect()

            def split_table(table, y):
                X = table.data
                X_train, y_train, X_test, y_test = time_train_test_split(
                    X, y, shuffle=False, test_rate=0.2)
                table1 = copy.deepcopy(table)
                table1.data = X_train
                table2 = copy.deepcopy(table)
                table2.data = X_test
                return table1, y_train, table2, y_test

            table1, y_train, table2, y_test = split_table(main_table, y)

            feat_engine = FeatEngine(feat_pipeline, config)
            feat_engine.fit_transform_merge_order1(table1, y_train)
            self.feat_engine = feat_engine

            feat_output = FeatOutput()
            self.feat_output = feat_output

            X_train, y_train, categories = feat_output.fit_transform_output(
                table1, y_train)

            gc.collect()
            self.feat_engine.transform_merge_order1(table2)
            X_test = self.feat_output.transform_output(table2)

            lgb = AutoLGB()

            lgb.param_compute(X_train, y_train, categories, config)

            lgb.param_opt_new(X_train, y_train, X_test, y_test, categories)

            len_test = X_test.shape[0]

            lgb.ensemble_train(X_train, y_train, categories, config, len_test)
            gc.collect()

            pred, pred0 = lgb.ensemble_predict_test(X_test)

            auc = roc_auc_score(y_test, pred0)
            print('source AUC:', auc)

            auc = roc_auc_score(y_test, pred)
            Model.ensemble_auc.append(auc)
            print('ensemble AUC:', auc)

            importances = lgb.get_ensemble_importances()

            self.model = lgb

            del X_train, y_train, X_test, y_test
            gc.collect()

        paths = os.path.join(feature_importance_path, version)
        if not os.path.exists(paths):
            os.makedirs(paths)
        importances.to_csv(os.path.join(
            paths, '{}_importances.csv'.format(
                datetime.now().strftime('%Y%m%d%H%M%S'))),
                           index=False)
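
# ---------------------------------------------------------------------------
# time_train_test_split() is called several times above but is not shown in
# this example.  The sketch below is an assumption inferred from the call
# sites (time-ordered rows, tail used as hold-out, returns X_train, y_train,
# X_test, y_test in that order); it is not the original helper.
# ---------------------------------------------------------------------------
def time_train_test_split(X, y, test_rate=0.2, shuffle=False):
    # rows are assumed to be sorted by time, so the last `test_rate` fraction
    # forms the hold-out window; `shuffle` is accepted for signature
    # compatibility with the calls above but ignored in this sketch
    split = int(len(X) * (1 - test_rate))
    X_train, X_test = X.iloc[:split], X.iloc[split:]
    y_train, y_test = y.iloc[:split], y.iloc[split:]
    return X_train, y_train, X_test, y_test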
Beispiel #38
0
class Application(Frame):
    def __init__(self, master = None):
        Frame.__init__(self, master)
        
        self.openfile = None  # handle of the file loaded via open()
        background = 'white'
        text_background = '#EEE'

        master.configure(bg = background)
        master.minsize(600, 600)
        master.title('代码合并器')  # 'Code Merger'

        master.rowconfigure(1, weight = 1)
        for col in range(10):
            self.master.columnconfigure(col, weight = 1)
        
        self.label_src = Label(master, text = '原始码', bg = background)  # '原始码' = 'source codes'
        self.label_src.grid(row = 0, column = 0, rowspan = 1, columnspan = 5, sticky = W+S, padx = 10, pady = 10)

        self.text_src = Text(master, bg = text_background)
        self.text_src.grid(row = 1, column = 0, rowspan = 1, columnspan = 5, sticky = W+E+N+S, padx = (10, 5), pady = 0)

        self.label_dst = Label(master, text = '精简码', bg = background)  # '精简码' = 'condensed codes'
        self.label_dst.grid(row = 0, column = 5, rowspan = 1, columnspan = 5, sticky = W+S, padx = 5, pady = 10)

        self.text_dst = Text(master, bg = text_background)
        # self.text_dst.config(state = 'disable')
        self.text_dst.grid(row = 1, column = 5, rowspan = 1, columnspan = 5, sticky = W+E+N+S, padx = (5, 10), pady = 0)

        self.button_open = Button(master, text='导入', width = '10', bg = background, command = self.open)  # '导入' = 'Load'
        self.button_open.grid(row = 2, column = 2, rowspan = 1, columnspan = 2, sticky = N+S, pady = 10)

        self.button_merge = Button(master, text='合并', width = '10', bg = background, command = self.merge)  # '合并' = 'Merge'
        self.button_merge.grid(row = 2, column = 4, rowspan = 1, columnspan = 2, sticky = N+S, pady = 10)

        self.button_save = Button(master, text='导出', width = '10', bg = background, command = self.save)  # '导出' = 'Export'
        self.button_save.grid(row = 2, column = 6, rowspan = 1, columnspan = 2, sticky = N+S, pady = 10)

    def open(self):

        self.openfile = tkFileDialog.askopenfile(mode = 'r', defaultextension=".txt")
        if self.openfile is None:
            # user cancelled the file dialog
            return
        text = self.openfile.read()
        print 'File loaded.'
        print text
        self.text_src.delete(0.0, END)
        self.text_src.insert(END, text)

    def merge(self):
        # self.text_dst.config(state = 'normal')

        text = self.text_src.get('1.0', END)
        # print text.encode('utf-8')
        codes2num = decode(text)
        # print codes2num

        self.merger = Merger(codes2num)

        self.text_dst.delete(0.0, END)

        result_text = ''
        for k in range(10, 3, -1):
            result_text += '最大长度' + str(k) + ' '  # '最大长度' = 'max length'
            result_text += encode(self.merger.merge_result(k))
            # print result_text

        self.text_dst.insert(END, result_text)
        # self.text_dst.config(state = 'disable')

    def save(self):

        self.savefile = tkFileDialog.asksaveasfile(mode = 'w', defaultextension=".txt")
        if self.savefile is None:
            # user cancelled the file dialog
            return
        text = self.text_dst.get(0.0, END)
        # print text.encode('utf-8')
        self.savefile.write(text.encode('utf-8'))
        self.savefile.close()
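
# A minimal launcher for the Application class above.  The original example
# does not show its imports or __main__ block; this sketch is an assumption
# and presumes Python 2 Tkinter with `from Tkinter import *` at the top of
# the file.
if __name__ == '__main__':
    root = Tk()
    app = Application(master=root)
    root.mainloop()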
Beispiel #39
0
from merger import Merger

if __name__ == '__main__':
    m = Merger('file_one.txt', 'file_two.txt')
    m.run()
Beispiel #40
0
    def __call__(self, full_table, bad_tables, good_tables, **kwargs):
        """
        table has been trimmed of extraneous columns.
        """
        self.setup_tables(full_table, bad_tables, good_tables, **kwargs)

        self.SCORE_ID = add_meta_column(
            chain(self.bad_tables, self.good_tables), SCORE_VAR)
        self.CLASS_ID = add_meta_column(chain(self.bad_tables,
                                              self.good_tables),
                                        "INFCLASS",
                                        vals=['0', '1'])

        start = time.time()
        self.compute_perrow_influences(self.bad_tables, self.bad_err_funcs)
        self.compute_perrow_influences(self.good_tables, self.good_err_funcs)
        self.cost_compute_inf = time.time() - start

        start = time.time()
        if self.tree_alg == 'c45':
            table, rules = self.c45_rules()
        elif self.tree_alg == 'or':
            table, rules = self.orange_dt_rules()
        elif self.tree_alg == 'dt':
            table, rules = self.sk_dt_rules(max_depth=12)
        elif self.tree_alg == 'rt':
            table, rules = self.sk_rt_rules(max_depth=12)
        else:
            _logger.warn(
                "unknown NDT algorithm %s.  Defaulting to regression tree",
                self.tree_alg)
            table, rules = self.sk_rt_rules(max_depth=12)
        self.cost_learn = time.time() - start

        #
        # ok now convert rules to clusters
        #

        _logger.debug("got %d rules", len(rules))
        fill_in_rules(rules, table, cols=self.cols)

        self.cost_learn = time.time() - start

        clusters = [Cluster.from_rule(rule, self.cols) for rule in rules]
        for cluster in clusters:
            cluster.error = self.influence_cluster(cluster)
        clusters = filter_bad_clusters(clusters)
        clusters.sort(key=lambda c: c.error, reverse=True)
        print '\n'.join(map(str, clusters[:5]))

        self.all_clusters = self.final_clusters = clusters
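        # NOTE: the early return below short-circuits the method, so the
        # cluster-merging code after it is never reached as written.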
        return self.final_clusters

        #
        # merge the clusters
        #
        thresh = compute_clusters_threshold(clusters, nstds=1.5)
        is_mergable = lambda c: c.error >= thresh

        params = dict(kwargs)
        params.update({
            'cols':
            self.cols,
            'err_func':
            self.err_func,
            'influence':
            lambda c: self.influence_cluster(c),
            'influence_components':
            lambda c: self.influence_cluster_components(c),
            'is_mergable':
            is_mergable,
            'use_mtuples':
            False,
            'learner':
            self
        })
        self.merger = Merger(**params)
        merged_clusters = self.merger(clusters)
        merged_clusters.sort(key=lambda c: c.error, reverse=True)

        clusters.extend(merged_clusters)
        normalize_cluster_errors(clusters)
        clusters = list(set(clusters))
        self.all_clusters = clusters
        self.final_clusters = merged_clusters

        self.costs = {'cost_learn': self.cost_learn}
        return self.final_clusters
Beispiel #41
0
    def assemble(self):
        """
        Builder method: build a Chain of linked Components
        :return:
        """
        log.info('Assembling Chain: %s...' % self.chain_str)

        # Create linked list of input/filter/output (ETL Component) objects
        chain_str = self.chain_str
        sub_comps = []
        while chain_str:
            chain_str = chain_str.strip()

            # Check and handle Splitter construct
            # e.g. input_xml_file |(transformer_xslt|output_file) (output_std) (transformer_xslt|output_std)
            if chain_str.startswith('('):
                etl_section_name, chain_str = chain_str.split(')', 1)
                etl_section_name = etl_section_name.strip('(')

                # Check for subchain (split at Filter level)
                if '|' in etl_section_name:
                    # Have subchain: use Chain to assemble
                    sub_chain = Chain(etl_section_name, self.config_dict)
                    sub_chain.assemble()
                    child_comp = sub_chain.first_comp
                else:
                    # Single component (Output) to split
                    child_comp = factory.create_obj(self.config_dict,
                                                    etl_section_name.strip())

                # Assemble Components (can be subchains) for Splitter later
                sub_comps.append(child_comp)
                if '(' in chain_str:
                    # Still components (subchains) to assemble for Splitter
                    continue

            if len(sub_comps) > 0:
                if chain_str.startswith('|'):
                    # Next component is Merger with children
                    etl_comp = Merger(self.config_dict, sub_comps)
                    dummy, chain_str = chain_str.split('|', 1)
                else:
                    # Next component is Splitter with children
                    etl_comp = Splitter(self.config_dict, sub_comps)
                sub_comps = []
            else:

                # "Normal" case: regular Components piped in Chain
                if '|' in chain_str:
                    # More than one component in remaining Chain
                    etl_section_name, chain_str = chain_str.split('|', 1)
                else:
                    # Last element, we're done!
                    etl_section_name = chain_str
                    chain_str = None

                # Create the ETL component by name and properties
                etl_comp = factory.create_obj(self.config_dict,
                                              etl_section_name.strip())

            # Add component to end of Chain
            self.add(etl_comp)
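
        # Chain syntax recap (taken from the comments above): components are
        # separated by '|'; a parenthesized group collects one or more
        # sub-chains.  If the group list is followed by '|' the collected
        # sub-components feed a Merger, otherwise they become the children of
        # a Splitter, e.g.
        #   input_xml_file |(transformer_xslt|output_file) (output_std) (transformer_xslt|output_std)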
    parser = OptionParser(version="%prog " + __VERSION__,
                          usage=usage,
                          description=banner)

    parser.add_option("--dir",
                      "-d",
                      action="store",
                      type="string",
                      dest="dir",
                      help="Files match (Default: *.ics)",
                      default="*.ics")
    parser.add_option("--ical",
                      "-i",
                      action="store",
                      type="string",
                      dest="icalfile",
                      help="iCalendar file output")

    (options, args) = parser.parse_args()

    if options.icalfile == "":
        options.icalfile = None

    if options.icalfile is not None:
        options.icalfile = os.path.realpath(options.icalfile)
        Merger(options.dir, options.icalfile)
        sys.exit(0)

    sys.exit(1)
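
# Example invocation of the script above (the script name is assumed; the
# options map to the OptionParser definitions): merge every file matched by
# --dir into one iCalendar file:
#   python ical_merge.py --dir "*.ics" --ical merged.ics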
Beispiel #43
0
	def QC_merge_runs(self):
		# if this is a germline sample, QC all of the normal runs with each other.
		if self.sample_json['sample_type'] == 'germline':
			# Use the sample_status here to not re-run the QC and to not overwrite run status. The 'sample_status' should be reset to 'pushed' when new runs are pushed..
			#if self.sample_json['sample_status'] != 'pending_merge' and self.sample_json['sample_status'] != 'pending_3x3_review' and self.sample_json['sample_status'] != 'merged':
			# if the user specified the '--pass_fail' option, then run this part still
			if self.sample_json['sample_status'] == 'pushed' or self.options.pass_fail or self.options.qc_all:
				# QC the normal runs with each other
				self.QC_runs(self.sample_json['runs'])
				# write the sample json file
				write_json(self.sample_json['json_file'], self.sample_json)
	
			# what if there is only one run that passes all of the metrics? It should be marked as the 'final_json' and have the 'pass_fail_merged' flag marked as pass.
			# make the merger
			merger = Merger(self.sample_json['json_file'])
			# Check to see if the normal runs are ready to be merged.
			merge = merger.check_merge(self.sample_json['runs'])
			if merge == True:
				# merge the normal and/or tumor runs. Will only merge the passing runs with each other.
				merger.merge_runs('germline')

				# load the sample json file because merger edited it.
				self.sample_json = json.load(open(self.sample_json['json_file']))

				# update the merged run status
				merger.update_merged_run_status(self.sample_json['merged_json'])
	
				if json.load(open(self.sample_json['merged_json']))['pass_fail_merged_status'] == 'pass':
					# Set the sample_status
					self.sample_json['sample_status'] = 'merged'
					# cleanup the individual run bam files
					self.cleanup_sample.cleanup_runs(self.sample_json['runs'], self.sample_json['analysis']['settings']['cleanup'], self.no_errors)
					# Cleanup the merged dir 
					self.cleanup_sample.cleanup_runs([self.sample_json['merged_json']], self.sample_json['analysis']['settings']['cleanup'], self.no_errors)
				else:
					self.sample_json['sample_status'] = 'awaiting_more_sequencing'
	

		# if this is a tumor_normal sample, find the normal and tumor runs, and then QC them with each other.
		elif self.sample_json['sample_type'] == 'tumor_normal':
			# Separate the runs into tumor and normal lists
			normal_runs, tumor_runs = self.getTumor_Normal()
	
			if self.sample_json['analysis']['settings']['type'] == 'all_tumor_normal':
				# Use the sample_status here to not re-run the QC and to not overwrite run status. The 'sample_status' should be reset to 'pushed' when new runs are pushed..
				#if self.sample_json['sample_status'] != 'pending_merge' and self.sample_json['sample_status'] != 'pending_3x3_review' and self.sample_json['sample_status'] != 'merged':
				# if the user specified the '--pass_fail' option, then run this part still
				if self.sample_json['sample_status'] == 'pushed' or self.options.pass_fail or self.options.qc_all:
					# QC the normal or tumor runs with each other
					self.QC_runs(normal_runs, 'normal_')
					self.QC_runs(tumor_runs, 'tumor_')
					# now QC the tumor and normal runs together.
					self.QC_normal_tumor_runs(normal_runs, tumor_runs)
					# make the excel spreadsheet containing the data and copy it back to the proton
					#self._make_xlsx()
					# write the sample json file
					write_json(self.sample_json['json_file'], self.sample_json)
	
				# make the merger
				merger = Merger(self.sample_json['json_file'])
				# Check to see if the normal runs are ready to be merged.
				merge_normal = merger.check_merge(normal_runs, 'Normal/', 'normal_')
				if merge_normal == True:
					# merge the normal and/or tumor runs. Will only merge the passing runs with each other.
					merger.merge_runs('normal', 'Normal_', 'normal_')
	
				# Check to see if the tumor runs are ready to be merged.
				merge_tumor = merger.check_merge(tumor_runs, 'Tumor/', 'tumor_')
				if merge_tumor == True:
					merger.merge_runs('tumor', 'Tumor_', 'tumor_')

				# load the sample json file because merger edited it.
				self.sample_json = json.load(open(self.sample_json['json_file']))
	
				# If any runs were merged, QC them. If there are only 1 normal and tumor run, they won't be QCd again. 
				#if normal_merge_dir != '' or tumor_merge_dir != '' or (len(normal_passing_bams) == 1 and len(tumor_passing_bams) == 1):	
				# only QC all for the actual merged runs for now (PNET).
				# now QC the tumor and normal merged bams together if both normal and tumor runs are ready.
				if (merge_normal or merge_tumor) and ('merged_normal_json' in self.sample_json and 'merged_tumor_json' in self.sample_json):
					self.sample_json, qc_json = self.qc_run.QC_2Runs(self.sample_json, self.sample_json['merged_normal_json'], self.sample_json['merged_tumor_json'], 'normal_', 'tumor_', '_merged')
					self.sample_json, merged_perc_avail_bases = self.qc_run.update_3x3_runs_status(self.sample_json, self.sample_json['merged_normal_json'], self.sample_json['merged_tumor_json'], qc_json)
					# update the merged run status 
					merger.update_merged_run_status(self.sample_json['merged_normal_json'], merged_perc_avail_bases)
					merger.update_merged_run_status(self.sample_json['merged_tumor_json'], merged_perc_avail_bases)

					# cleanup the individual run bam files
					if merged_perc_avail_bases > .9:
						# Cleanup the PTRIM.bam and chr bam files after all of the QC is done.
						# are there any other files to clean up?
						self.cleanup_sample.cleanup_runs(self.sample_json['runs'], self.sample_json['analysis']['settings']['cleanup'], self.no_errors)
						#self.cleanup_sample.delete_runs(runs, self.sample_json['analysis']['settings']['cleanup'], self.no_errors)

						# Cleanup after the merging QC is done.
						self.cleanup_sample.cleanup_runs([self.sample_json['final_normal_json'], self.sample_json['final_tumor_json']], self.sample_json['analysis']['settings']['cleanup'], self.no_errors)

						# Set the sample_status
						self.sample_json['sample_status'] = 'merged_pass'
					else:
						self.sample_json['sample_status'] = 'awaiting_more_sequencing'

		# print the final status
		if self.no_errors == False or self.qc_run.no_errors == False:
			sys.stderr.write("%s finished with errors. See %s/sge.log for more details"%(self.sample_json['sample_name'], self.sample_json['output_folder']))
			self.sample_json['sample_status'] = 'failed'
			write_json(self.sample_json['json_file'], self.sample_json)
			sys.exit(1)
		else:
			print "%s finished with no errors!"%(self.sample_json['sample_name'])

		# write the sample json file
		write_json(self.sample_json['json_file'], self.sample_json)

		# make the excel spreadsheet containing the data and copy it back to the proton
		self._make_xlsx()
import matplotlib.pyplot as plt
import numpy as np
from keras import Sequential
from keras.callbacks import History
from keras.layers import Dense, BatchNormalization
from sklearn.model_selection import train_test_split

from loader import Loader
from merger import Merger

params, scores = Loader.get_flow_data(6767, 100)
qualities = Loader.get_task_qualities()
description = Loader.get_description(6767)

merger = Merger(params, description, scores, qualities)
X, y = merger.merge(100)

# Split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

# model

# --> 0.0017802061972600456
model = Sequential()
model.add(Dense(32, input_shape=(X.shape[1], ), activation='relu'))
model.add(BatchNormalization())
model.add(Dense(32, activation='relu'))
model.add(BatchNormalization())
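
# The example appears to be cut off here.  A plausible completion is sketched
# below (an assumption, not part of the original): a single-unit regression
# head trained with mean squared error, which would produce a validation loss
# in the range quoted in the comment above.
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
history = model.fit(X_train, y_train,
                    validation_data=(X_test, y_test),
                    epochs=100, batch_size=32, verbose=0)
print(history.history['val_loss'][-1])

plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.legend()
plt.show()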
Beispiel #45
0
value = 0
for code, num in case.items():
    print code, num, 
    value += num
print
print value

print str(timeit.timeit('test(case)', 'from __main__ import test, case', number=1)) + 's used.'

TEST_CASES = [
                ['012', '013', '023', '123'],
                ['012', '013', '023', '124', '134', '234'],
                ['012', '013', '023', '123', '124', '134', '234'],
                ['012', '013', '023', '123', '123', '124', '134', '234'],
                ['012', '013', '014', '023', '024', '034', '123', '124', '134', '234'],
                ['012', '023', '013', '123', '123', '234', '134', '124', '125', '127', '157', '257', '125', '127', '157'],
                ['012', '023', '013', '123', '123', '234', '134', '124', '125', '125', '127', '157', '257', '125', '127', '157'],
             ]

codes = Merger([]).get_codes(3)
case = TEST_CASES[0]
# case = [to_string(random.choice(codes)) for dummy_i in range(10000)]
# case = TEST_CASES[3] * 10
# case = TEST_CASES[0] + TEST_CASES[4]
# case = TEST_CASES[6]

# print str(timeit.timeit('test(case)', 'from __main__ import test, case', number=1)) + 's used.'

# print len(" sdsd\n\n \t".strip())
Beispiel #46
0
  def disc_merge(self, cluster, dim, vals, skip=None):
    merged = Merger.disc_merge(self, cluster, dim, vals, skip)
    if merged:
      merged.c_range = list(self.c_range)
      merged.inf_func = merged.create_inf_func(self.learner.l)
    return merged