def get_image_names(path, K=0, p=0, q=1):  ## List of image names
    try:
        _list = sb.check_output("cd " + path + "; ls", shell=True).splitlines()
    except:
        # `ls` failed (e.g. on Windows); fall back to `dir`.
        _list = sb.check_output("cd " + path + " && dir", shell=True).splitlines()
    image_names = list(map(lambda x: x.decode("UTF-8"), _list))
    if (os_ == "windows"):
        # `dir` output has extra columns; keep only the file-name part.
        image_names = list(map(lambda im: "im_" + im.split(" im_")[-1], image_names))
    image_names = list(filter(lambda im: "im_" in im and ".png" in im, image_names))
    if (q - 1 < 0):
        # Keep only a random fraction q of the images.
        N = len(image_names)
        image_names = random.sample(image_names, int(q * N))
    # Keep only names containing exactly two underscores.
    image_names = list(filter(lambda im: len(im.split("_")) - 1 == 2, image_names))
    random.shuffle(image_names)
    if (K > 1):  ## Then perform cross-validation: split into K folds.
        image_names = list_split(image_names, K)
    else:
        if (not p):
            image_names = [image_names]
        else:
            # Hold out a fraction p of the images as a validation set.
            N = len(image_names)
            valid_idx = random.sample(range(N), int(p * N))
            training_set, validation_set = [], []
            for i in range(N):
                if (i in valid_idx):
                    validation_set.append(image_names[i])
                else:
                    training_set.append(image_names[i])
            image_names = [training_set, validation_set]
    return image_names
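# `list_split` is not defined in this snippet. Below is a minimal sketch of a
# helper with the behaviour assumed above (split a list into K roughly equal
# folds for cross-validation); the name and signature are taken from the call
# site, the body is an assumption, not the original implementation:
def list_split(lst, K):
    # Distribute len(lst) items over K folds; the first len(lst) % K folds
    # get one extra item each.
    size, extra = divmod(len(lst), K)
    folds, start = [], 0
    for k in range(K):
        end = start + size + (1 if k < extra else 0)
        folds.append(lst[start:end])
        start = end
    return folds

# Example: list_split(list(range(7)), 3) -> [[0, 1, 2], [3, 4], [5, 6]]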
def create_infileslist_from_inputs_list(inputs_list, intensors_desc):
    fileslist = []
    inputlistcount = len(inputs_list)
    intensorcount = len(intensors_desc)
    if os.path.isfile(inputs_list[0]):
        # Plain files: group them into one chunk of files per input tensor.
        chunks = inputlistcount // intensorcount
        fileslist = list(list_split(inputs_list, chunks))
        logger.debug(
            "create intensors list file type inlistcount:{} intensorcount:{} chunks:{} files_size:{}"
            .format(inputlistcount, intensorcount, chunks, len(fileslist)))
    elif os.path.isdir(inputs_list[0]) and inputlistcount == intensorcount:
        # Directories: one directory of files per input tensor.
        fileslist = [get_fileslist_from_dir(dir_path) for dir_path in inputs_list]
        logger.debug(
            "create intensors list directory type inlistcount:{} intensorcount:{} files_size:{}"
            .format(inputlistcount, intensorcount, len(fileslist)))
    else:
        logger.error(
            'create intensors list failed: filelists:{} intensorcount:{}'.format(
                inputlistcount, intensorcount))
        raise RuntimeError()

    infileslist = create_infileslist_from_fileslist(fileslist, intensors_desc)
    if len(infileslist) == 0:
        logger.error(
            'create_infileslist_from_fileslist returned an empty infileslist (size: {})'.format(
                len(infileslist)))
        raise RuntimeError()
    return infileslist
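# In this snippet `list_split` is used to cut a flat file list into fixed-size
# chunks, one chunk per input tensor (note the second argument is a chunk
# size here, not a fold count as in the snippet above). A minimal generator
# sketch with the behaviour this call site assumes; the real helper is not
# shown in this section:
def list_split(lst, chunk_size):
    # Yield successive slices of `chunk_size` elements; the last may be shorter.
    for start in range(0, len(lst), chunk_size):
        yield lst[start:start + chunk_size]

# Example: list(list_split(['a.bin', 'b.bin', 'c.bin', 'd.bin'], 2))
#          -> [['a.bin', 'b.bin'], ['c.bin', 'd.bin']]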
def fit(self, X, y, max_depth=5, min_samples_split=2):
    """Build a regression decision tree.

    Note:
        At least one column of X must have more than 2 unique elements,
        and y cannot be all the same value.

    Arguments:
        X {list} -- 2d list object with int or float
        y {list} -- 1d list object with int or float

    Keyword Arguments:
        max_depth {int} -- The maximum depth of the tree. (default: {5})
        min_samples_split {int} -- The minimum number of samples required
            to split an internal node (default: {2})
    """
    # Initialize with depth, node, indexes
    self.root.score = sum(y) / len(y)  # prediction at the root: the mean of y
    idxs = list(range(len(y)))  # indexes of all training samples
    que = [(self.depth + 1, self.root, idxs)]  # start at depth self.depth + 1
    # Breadth-First Search
    while que:
        depth, nd, idxs = que.pop(0)
        # Terminate loop if tree depth is more than max_depth
        if depth > max_depth:
            depth -= 1
            break
        # Stop split when number of node samples is less than
        # min_samples_split or Node is 100% pure.
        if len(idxs) < min_samples_split or \
                all(map(lambda i: y[idxs[0]] == y[i], idxs)):
            continue
        # Stop split if no feature has more than 2 unique elements
        split_ret = self._choose_feature(X, y, idxs)
        if split_ret is None:
            continue
        # Split
        _, feature, split, split_avg = split_ret
        # Update properties of current node
        nd.feature = feature
        nd.split = split
        nd.left = Node(split_avg[0])
        nd.right = Node(split_avg[1])
        # Put children of current node in que
        idxs_split = list_split(X, idxs, feature, split)
        que.append((depth + 1, nd.left, idxs_split[0]))
        que.append((depth + 1, nd.right, idxs_split[1]))
    # Update tree depth and rules
    self.depth = depth
    self._get_rules()
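# In this tree, `list_split` partitions a node's sample indexes by a feature
# threshold. A minimal sketch of what such a helper could look like, inferred
# from the call site; the signature is taken from the call, the body is an
# assumption:
def list_split(X, idxs, feature, split):
    # Indexes whose feature value is below `split` go left, the rest go right.
    idxs_left = [i for i in idxs if X[i][feature] < split]
    idxs_right = [i for i in idxs if X[i][feature] >= split]
    return idxs_left, idxs_right

# Example: X = [[1.0], [2.0], [3.0]]; list_split(X, [0, 1, 2], 0, 2.5) -> ([0, 1], [2])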
def first_method(self):
    print('First method!')
    all_documentRankings = self.nodeManager.get_network_features()
    all_features = []
    all_labels = []
    doc_lengths = []
    doc_names = []
    for i in all_documentRankings:
        ####### TODO: adapt for machine learning with multiple types of complex
        ####### networks (one per threshold). For now a single network is used.
        # allRankings_for_doc_i = all_documentRankings[i]
        doc_names.append(i)
        # List of dictionaries, one per complex network (generated by the
        # thresholds in the MLN or embeddings case) / MODIFY LATER!!
        allRankings_for_doc_i = all_documentRankings[i][0]
        document_data_for_doc_i = self.corpus[i]
        document_labels = get_labels(document_data_for_doc_i[1])
        rankings = get_rankings(allRankings_for_doc_i)
        doc_lengths.append(len(rankings))
        all_features.extend(rankings)
        all_labels.extend(document_labels)

    for i, j in zip(all_features, all_labels):
        print(i, j)
    print('')
    print('waaaaaaa')  # debug marker

    all_features = np.array(all_features)
    all_labels = np.array(all_labels)
    obj = KFoldCrossValidation(all_features, all_labels, self.classifier)
    predictions = obj.train_and_predict()
    for i, j in zip(all_labels, predictions):
        print(i, j)
    a = input()  # pause for manual inspection

    # Regroup the flat predictions into one partition per document.
    partitions = list_split(predictions, doc_lengths)
    document_rankings = get_ml_rankings(doc_names, partitions)
    return document_rankings
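# Here `list_split` regroups the flat prediction sequence into one chunk per
# document, using the per-document lengths collected above. A minimal sketch
# with the behaviour this call site assumes (the body is an assumption):
def list_split(values, lengths):
    # Cut `values` into consecutive chunks whose sizes are given by `lengths`.
    chunks, start = [], 0
    for n in lengths:
        chunks.append(values[start:start + n])
        start += n
    return chunks

# Example: list_split([0, 1, 1, 0, 1], [2, 3]) -> [[0, 1], [1, 0, 1]]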
def create_infileslist_from_fileslist(fileslist, intensors_desc):
    if len(intensors_desc) != len(fileslist):
        logger.error('fileslist:{} intensor:{} not match'.format(
            len(fileslist), len(intensors_desc)))
        raise RuntimeError()
    files_count_per_batch, runcount = get_files_count_per_batch(
        intensors_desc, fileslist)
    # For each input tensor, cut its file list into batches of
    # files_count_per_batch files each.
    files_perbatch_list = [
        list(list_split(fileslist[j], files_count_per_batch))
        for j in range(len(intensors_desc))
    ]

    # Regroup per run: infileslist[i][j] holds the files for run i, tensor j.
    infileslist = []
    for i in range(runcount):
        infiles = []
        for j in range(len(intensors_desc)):
            logger.debug(
                "create infileslist i:{} j:{} runcount:{} lists:{} filesPerBatch:{}"
                .format(i, j, runcount, files_perbatch_list[j][i],
                        files_count_per_batch))
            infiles.append(files_perbatch_list[j][i])
        infileslist.append(infiles)
    return infileslist
def tokenize(keyword='train'):
    """Set up the input pipeline.

    Returns: articles_int, abstracts_int, guiding_objects, object_sequences,
        left_mask, left_input, right_mask, right_input, tokenizer
    """
    print('Start tokenize', keyword, 'data...')
    a_path = [s for s in os.listdir(config.articles_path) if keyword in s]
    e_path = [s for s in os.listdir(config.objects_path) if keyword in s]

    print('\tLoading articles, abstracts and objects...')
    articles_abstracts_data = []
    for f in a_path:
        articles_abstracts_data.extend(
            [data for data in _gen_data_from_bin_file(config.articles_path + f)])
    articles_data_all = [
        article.features.feature['article'].bytes_list.value[0].decode()
        for article in articles_abstracts_data
    ]
    abstracts_data_all = [
        article.features.feature['abstract'].bytes_list.value[0].decode()
        for article in articles_abstracts_data
    ]
    abstracts_data_all = [
        config.start_token + ' ' + abstracts + ' ' + config.end_token
        for abstracts in abstracts_data_all
    ]
    objects_data_all = []
    for f in e_path:
        objects_data_all.extend(
            [line for line in open(config.objects_path + f, 'r', encoding='UTF-8')][3::4])
    objects_data_all = [
        [re.split(r'\s?<sep>\s?', s)[:-1]
         for s in re.split(r'^<s>\s?|\s?</s>\s?<s>\s?|\s?</s>$', objects.strip())[1:-1]]
        for objects in objects_data_all
    ]
    objects_data_all = [[obj for objs in objects for obj in objs]
                        for objects in objects_data_all]
    articles_data_all = articles_data_all[:config.data_size]
    abstracts_data_all = abstracts_data_all[:config.data_size]
    objects_data_all = objects_data_all[:config.data_size]

    print('\tDividing sentences...')
    nltk_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences_data_all = [
        nltk_tokenizer.tokenize(article) for article in articles_data_all
    ]

    print('\tTokenizing...')
    tokenizer = pickle.load(open('tokenizer.pickle', 'rb'))
    tokenizer.num_words = config.vocab_size
    tokenizer.oov_token = config.oov_token
    oov_index = tokenizer.word_index[config.oov_token]
    sentences_int_all = [
        tokenizer.texts_to_sequences(sentences)
        for sentences in sentences_data_all
    ]
    abstracts_int_all = tokenizer.texts_to_sequences(abstracts_data_all)
    objects_int_all = [
        tokenizer.texts_to_sequences(objs) for objs in objects_data_all
    ]

    print('\tGenerating partial articles...')
    articles_int, abstracts_int, guiding_objects, object_sequences, left_int, right_int, origin_idx = [], [], [], [], [], [], []
    for i in range(len(sentences_int_all)):  # for each article
        # Exclude articles with fewer than objects_max_num non-OOV objects.
        non_oov_obj = []
        for obj in objects_int_all[i]:
            if obj != [oov_index] and obj != [oov_index, oov_index] \
                    and obj != [oov_index, oov_index, oov_index]:
                non_oov_obj.append(obj)
        if len(set([tuple(obj) for obj in non_oov_obj])) < config.objects_max_num:
            continue
        if [1] in non_oov_obj:
            print(non_oov_obj)
        object_words = [word for obj in non_oov_obj for word in obj]
        # For each sentence of the article: if any word of the abstract's
        # objects appears in it, append the sentence to part_article.
        part_article = []
        for sentence in sentences_int_all[i]:
            for word in object_words:
                if word in sentence:
                    part_article.extend(sentence)
                    break
        if not part_article:
            continue
        # Pick the guiding_object and the object_sequence.
        candidates = [tuple(t) for t in non_oov_obj] + [
            tuple([word]) for o in non_oov_obj for word in o if len(o) > 1
        ]
        # Drop single-word candidates that are only the OOV token.
        candidates = [word for word in candidates if word != (oov_index,)]
        candidates = Counter(candidates)
        candidates, times = zip(
            *sorted(candidates.items(), key=lambda x: x[1], reverse=True))
        if times[0] == 1:
            # If every candidate occurs only once, keep only the original entities.
            candidates = non_oov_obj
        else:
            candidates = [list(obj) for obj in candidates]
        max_times = times[0]
        min_diff = 99999
        left_abstract, right_abstract = [], []
        # Iterate over a snapshot so the removals below do not shift times[k].
        for k, obj in enumerate(list(candidates)):
            if times[k] < max_times:
                if not left_abstract:
                    # Every previous object failed to split the abstract.
                    max_times = times[k]
                else:
                    # A split already succeeded; stop looking.
                    break
            if len(obj) == 1 and obj not in non_oov_obj:
                # Single words that are not original entities:
                # drop them if they are high-frequency or not nouns.
                if obj[0] < 400 or nltk.pos_tag(
                        [tokenizer.index_word[obj[0]]])[0][1] not in [
                            'NN', 'NNS', 'NNP', 'NNPS'
                        ]:
                    candidates.remove(obj)
                    continue
            try:
                left_abstract, right_abstract = list_split(
                    abstracts_int_all[i], obj)
                if abs(len(left_abstract) - len(right_abstract)) < min_diff:
                    # Prefer the split whose two halves are closest in length.
                    guiding_object = obj
                    min_diff = abs(len(left_abstract) - len(right_abstract))
            except ValueError:
                # The object does not occur in the abstract; split failed.
                continue
        if not left_abstract:
            # Every object failed to split the abstract.
            continue
        candidates.remove(guiding_object)
        object_sequence = [
            word for obj in candidates[:config.objects_max_num - 1] for word in obj
        ]
        articles_int.append(part_article)
        abstracts_int.append(abstracts_int_all[i])
        guiding_objects.append(guiding_object)
        object_sequences.append(object_sequence)
        left_int.append(left_abstract)
        right_int.append(right_abstract)
        origin_idx.append(i)

    print('\tGenerating input data and mask...')
    left_mask = [[1] * len(left) + [0] * len(guiding_objects[i] + object_sequences[i])
                 for i, left in enumerate(left_int)]
    left_input = [
        left + guiding_objects[i] + object_sequences[i]
        for i, left in enumerate(left_int)
    ]
    right_mask = [[0] * len(left + guiding_objects[i]) + [1] * len(right_int[i])
                  for i, left in enumerate(left_int)]
    right_input = [
        left + guiding_objects[i] + right_int[i]
        for i, left in enumerate(left_int)
    ]
    # padding
    articles_int = pad_sequences(articles_int, maxlen=config.articles_maxlen,
                                 padding='post', truncating='post')
    left_mask = pad_sequences(left_mask, maxlen=config.left_abstracts_maxlen + 5,
                              padding='pre', truncating='pre')
    left_input = pad_sequences(left_input, maxlen=config.left_abstracts_maxlen + 5,
                               padding='pre', truncating='pre')
    right_mask = pad_sequences(right_mask,
                               maxlen=config.left_abstracts_maxlen +
                               config.right_abstracts_maxlen + 5,
                               padding='post', truncating='post')
    right_input = pad_sequences(right_input,
                                maxlen=config.left_abstracts_maxlen +
                                config.right_abstracts_maxlen + 5,
                                padding='post', truncating='post')
    # reverse left
    left_mask = left_mask[:, ::-1]
    left_input = left_input[:, ::-1]
    return articles_int, abstracts_int, guiding_objects, object_sequences, left_mask, left_input, right_mask, right_input, tokenizer
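# In tokenize(), `list_split` is expected to cut an abstract (a list of token
# ids) into the part before and the part after the first occurrence of a
# candidate object (a short list of token ids), raising ValueError when the
# object does not occur. A minimal sketch of such a helper, inferred from the
# call site; the real implementation is not shown in this snippet:
def list_split(sequence, sep):
    # Find the first occurrence of the sub-list `sep` inside `sequence`.
    n = len(sep)
    for start in range(len(sequence) - n + 1):
        if sequence[start:start + n] == sep:
            return sequence[:start], sequence[start + n:]
    raise ValueError('separator not found in sequence')

# Example: list_split([5, 8, 3, 9], [8, 3]) -> ([5], [9])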
def create_graphic(db, tb, reach=True, pkt_loss=True, jitter=True, latency=True,
                   start=None, end=None, mode='average', name=False, dpi=200):
    """Plot data from the SQL table (db, tb) with a given mode:

    average    - divides the list into 100 smaller lists and, for each division, takes its average
    max        - divides the list into 100 smaller lists and, for each division, returns the max value
    min        - divides the list into 100 smaller lists and, for each division, returns the min value
    fractile_x - divides the list into 100 smaller lists and, for each division, returns the x% fractile (default should be 49%)

    start/end should be in the format dd/mm/yy-HH:MM:SS
    name = name to give to the output file
    """
    if not any([reach, jitter, pkt_loss, latency]):
        print('No data to plot selected')
        return False
    reach = ',Reachability' if reach else ''
    pkt_loss = ',Pkt_loss' if pkt_loss else ''
    jitter = ',Jitter' if jitter else ''
    latency = ',Latency' if latency else ''
    start = time_epoch(tm=start) if start else None
    end = time_epoch(tm=end) if end else None
    data_get = 'Time%s%s%s%s' % (reach, pkt_loss, jitter, latency)

    # read data from the sql file
    data = sql().get_data(db=db, tb=tb, field=data_get, start=start, end=end)
    # get the name of the host and use it as the file name
    host = '.'.join(tb.split('_'))
    header = data.pop(0)
    txt = None

    # process data
    time_list = [i[header['Time']] for i in data]
    # if mode is not None:
    #### will continue with the most relevant output: min for reachability, max for the others and average for time
    time_list = list_split(list=time_list, mode=mode if mode else 'average')
    time_list = [dt.fromtimestamp(i) for i in time_list]
    # else:
    #     time_list = list_split(list=time_list, mode=None)
    #     time_list = [dt.fromtimestamp(i) for i in time_list]
    if reach:
        reach_list = [i[header[reach.lstrip(',')]] for i in data]
        reach_list = list_split(list=reach_list, mode=mode if mode else 'fractile_03')
        if not reach_list:
            txt = 'No data available'
    if pkt_loss:
        pkt_loss_list = [i[header[pkt_loss.lstrip(',')]] for i in data]
        pkt_loss_list = list_split(list=pkt_loss_list, mode=mode if mode else 'fractile_97')
        if not pkt_loss_list:
            txt = 'No data available'
    if jitter:
        jitter_list = [i[header[jitter.lstrip(',')]] for i in data]
        jitter_list = list_split(list=jitter_list, mode=mode if mode else 'fractile_97')
        if not jitter_list:
            txt = 'No data available'
    if latency:
        latency_list = [i[header[latency.lstrip(',')]] for i in data]
        latency_list = list_split(list=latency_list, mode=mode if mode else 'fractile_97')
        if not latency_list:
            txt = 'No data available'

    fig, ax1 = plt.subplots()
    plt.setp(ax1.get_xticklabels(), rotation=45)
    cwd = os.getcwd()
    if not name:
        name = cwd + '/graphs/' + host + '_' + str(time()) + '.png'
    if txt:
        plt.text(0.3, 0.5, txt, fontsize=14)
        plt.savefig(name, dpi=dpi, papertype='A4', bbox_inches='tight')
        plt.close()
        return False

    # plot reachability and packet loss in percent on the left axis
    # and jitter/latency in milliseconds on the right axis
    if reach:
        r = ax1.plot(time_list, reach_list, 'g', label='Reachability', linewidth=1.0)
    if pkt_loss:
        p = ax1.plot(time_list, pkt_loss_list, 'r', label='Packet Loss', linewidth=1.0)
    ax1.set_yticks([i for i in range(0, 101, 10)])
    ax1.set_xlim([time_list[0], time_list[-1]])
    ax1.minorticks_on()
    # Make the y-axis label, ticks and tick labels match the line color.
    if reach and pkt_loss:
        ax1.set_ylabel('Reachability/Packet loss (%)')
    elif reach and not pkt_loss:
        ax1.set_ylabel('Reachability (%)')
    elif not reach and pkt_loss:
        ax1.set_ylabel('Packet loss (%)')

    ax2 = ax1.twinx()
    if latency:
        l = ax2.plot(time_list, latency_list, 'y', label='Latency', linewidth=1.0)
    if jitter:
        j = ax2.plot(time_list, jitter_list, 'm', label='Jitter', linewidth=1.0)
    if jitter and latency:
        ax2.set_ylabel('Jitter/Latency (ms)')
    elif jitter and not latency:
        ax2.set_ylabel('Jitter (ms)')
    elif not jitter and latency:
        ax2.set_ylabel('Latency (ms)')

    legend = []
    if reach:
        legend = legend + r
    if pkt_loss:
        legend = legend + p
    if jitter:
        legend = legend + j
    if latency:
        legend = legend + l
    ax2.minorticks_on()
    ax2.legend(bbox_to_anchor=(0., 1.02, 1., 0.102), loc=3, handles=legend,
               mode="expand", borderaxespad=0., ncol=4)
    ax2.tick_params(axis='y', which='minor', bottom='off')
    ax2.set_xlim([time_list[0], time_list[-1]])
    #fig.tight_layout()
    #plt.savefig(name, dpi=dpi, papertype='A4', bbox_inches='tight')
    fig = plt.gcf()
    fig.canvas.set_window_title(name)
    plt.show()
    plt.close()
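# The `list_split` used by create_graphic is a downsampler: per the docstring
# it divides a series into 100 smaller lists and aggregates each one (average,
# max, min or an x% fractile). A minimal sketch matching that description; the
# exact bucketing and fractile handling of the real helper are assumptions:
def list_split(list, mode='average'):
    # The parameter is named `list` only because the call sites above pass it
    # by that keyword; it shadows the builtin inside this function.
    values = list
    if not values:
        return []
    size = max(1, len(values) // 100)  # aim for roughly 100 divisions
    buckets = [values[k:k + size] for k in range(0, len(values), size)]
    out = []
    for bucket in buckets:
        if mode == 'max':
            out.append(max(bucket))
        elif mode == 'min':
            out.append(min(bucket))
        elif mode.startswith('fractile_'):
            frac = int(mode.split('_')[1]) / 100.0
            out.append(sorted(bucket)[int(frac * (len(bucket) - 1))])
        else:  # 'average' (default)
            out.append(sum(bucket) / float(len(bucket)))
    return out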