def create_h5(self, split_num, iter_num, input_sequence, DEL_TRAIN_WAV):
    cpu_cores = int(split_num / iter_num)
    tmp1 = []
    tmp2 = []
    tmp3 = []
    # noisy_dir = join(noisy_dir, 'train')
    training_data_list = search_wav(self.noisy_dir)
    print('Total training files: ', len(training_data_list))
    for file in training_data_list:
        try:
            snr, noise_name, clean_name1, clean_name2 = file.split('/')[-1].split('_')
            clean_file = join(self.noisy_dir,
                              '_'.join(['0ms', 'n0', clean_name1, clean_name2]))
            noisy_file = file
        except:
            snr, noise_name, clean_name = file.split('/')[-1].split('_')
            clean_file = join(self.noisy_dir, '_'.join(['0ms', 'n0', clean_name]))
            noisy_file = file
        tmp1.append(clean_file)
        tmp2.append(noisy_file)
    training_num = 30000
    t1, t2 = shuffle(np.array(tmp1), np.array(tmp2))
    t1 = t1[:training_num]
    t2 = t2[:training_num]
    clean_split_list = split_list(t1, wanted_parts=split_num)
    noisy_split_list = split_list(t2, wanted_parts=split_num)
    start = 0
    end = cpu_cores
    for num in range(iter_num):
        print(start, end)
        pool = Pool(cpu_cores)
        func = partial(_create_split_h5, clean_split_list, noisy_split_list,
                       self.save_h5_dir, self.save_h5_name, input_sequence)
        pool.map(func, range(start, end))
        pool.close()
        pool.join()
        start = end
        end += cpu_cores
    if DEL_TRAIN_WAV:
        shutil.rmtree(self.noisy_dir)
def remove_list(update: Update, context: CallbackContext):
    lists = db.get_lists()
    buttons = [l.get("name") for l in lists]
    reply_markup = ReplyKeyboardMarkup(split_list(buttons, 3))
    update.message.reply_text("Choose list:", reply_markup=reply_markup)
    return conversations["remove_list"]["choose_list"]
def improved_create_users_from_ids(user_ids):
    """
    Improved version of create_users_from_ids.
    Roughly 100x faster (the profile queries can be batched by sending the IDs together).
    """
    users = []
    user_ids_list = utils.split_list(user_ids, 100)
    for index, ids in enumerate(user_ids_list):
        utils.print_step_log("CreateUsersList", index, len(user_ids_list))
        try:
            profs = tg.get_user_profiles(ids)
        except:
            traceback.print_exc()
            sleep(1)
            continue
        if profs is None or profs == []:
            continue
        for prof in profs:
            user = User(id=prof['id'],
                        name=prof['name'],
                        description=prof['description'],
                        friends_count=prof['friends_count'],
                        created_at=dt.strptime(prof['created_at'], "%a %b %d %H:%M:%S +0000 %Y"),
                        is_protected=prof['protected'])
            users.append(user)
        sleep(1)
    return users
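# Note: the utils.split_list used above is not shown in this snippet. A minimal,
# hypothetical sketch is given here, assuming it chunks a list into consecutive
# pieces of at most `size` items (e.g. batches of 100 IDs per profile request).
def split_list(items, size):
    """Split `items` into consecutive chunks of length `size` (the last chunk may be shorter)."""
    return [items[i:i + size] for i in range(0, len(items), size)]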
async def members(self, ctx: utils.CustomContext, *, role: Role):
    """Check the list of members in a certain role.
    Permissions needed: `Manage Messages`"""

    in_role = [f"{member.mention} ({member})" for member in role.members]
    columns = [in_role, ["\u200b"]]

    if len(in_role) > 1:
        columns[0], columns[1] = utils.split_list(in_role)
        columns.sort(reverse=True)

    if len("\n".join(columns[0])) > 1024:
        columns[0] = columns[0][:20]
    if len("\n".join(columns[1])) > 1024:
        columns[1] = columns[1][:20]

    embed = self.bot.embed(
        ctx, title=f"Members in {role.name} [{sum(1 for m in role.members)}]")
    for column in columns:
        embed.add_field(name="\u200b",
                        value="\n".join(column) if column else "\u200b")

    await ctx.send(embed=embed)
def replace_boilerplate(message, related, region_name):
    proposals, addresses = split_list(lambda x: isinstance(x, Proposal), related)
    return template_replace(message, {
        "region": region_name,
        "proposals": "\n<br/>".join(f"{p.address} ({p.case_number})"
                                    for p in proposals),
        "addresses": "\n<br/>".join(addr[0] for addr in addresses)
    }).replace("\n", "\n<br/>")
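# The split_list used in replace_boilerplate (and in geocode_addresses and
# find_matching_proposals below) takes a predicate first and is unpacked into two
# sequences. Its implementation is not shown; a minimal sketch, assuming it
# partitions into (matching, non-matching):
def split_list(predicate, items):
    """Partition `items` into the elements where `predicate` is true and the rest."""
    matching, rest = [], []
    for item in items:
        (matching if predicate(item) else rest).append(item)
    return matching, rest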
def __init__(self, formula_path, dict_path, separate_conj_stmt=False, binary=False,
             part_no=-1, part_total=0, file_list=None, deepmath=False, norename=False,
             filter_abelian=False, compatible=False):
    # part_no, part_total: will not shuffle.
    self.formula_path = formula_path
    self.dict_path = dict_path
    self.maxsize = 500  # maxsize for async queue
    self.iter_ = 0  # epoch. Legacy reason for its name
    self.total_in_epoch = -1  # conj, stmt pairs supply in current epoch.
    self.total_iter = -1  # total iteration
    self.rename = not norename
    if not os.path.exists(dict_path):
        self.dict = self.build_dictionary()
    else:
        self.dict = torch.load(dict_path)
    self.queue = Queue(self.maxsize)
    self.reader = Process(target=self.read)
    self.dict_size = len(self.dict.keys())
    self.separate_conj_stmt = separate_conj_stmt
    self.binary = binary
    self.part_no = part_no
    self.part_total = part_total
    if file_list is None:
        file_list = os.listdir(self.formula_path)
        if part_total != 0:
            file_list.sort()
            file_list = split_list(file_list, part_total, part_no)
    else:
        if part_total != 0:
            file_list = split_list(file_list, part_total, part_no)
    self.file_list = file_list
    self.deepmath = deepmath
    self.filter_abelian = filter_abelian
    self.compatible = compatible
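# The three-argument split_list(file_list, part_total, part_no) above is not shown
# in this snippet. A hypothetical sketch, assuming it returns only the part_no-th
# of part_total roughly equal contiguous slices (so each worker sees one shard):
def split_list(items, part_total, part_no):
    """Return the `part_no`-th of `part_total` contiguous slices of `items`."""
    per_part = (len(items) + part_total - 1) // part_total  # ceiling division
    return items[part_no * per_part:(part_no + 1) * per_part]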
def parse_cp_mv(args):
    """
    parse args like: 4 5 6 7 to asd bsd csd
    return [[tagsd,asdf,vvv,sdf],[4,5,6,7]]
    """
    # here we split two arrays by <to> keyword
    args = u.sp_split(args)
    nodes, tags = u.split_list(args, 'to')
    return [nodes, u.everything_to_str(tags)]
def _make_registers(self, n_chunks=7):
    self.lowest_note = self.range[0]
    self.highest_note = self.range[-1]
    registers = list(utils.split_list(self.range, n_chunks=n_chunks))
    self.middle_register = registers[3]  # assuming 7 divisions
    self.highest_register = registers[-1]
    self.lowest_register = registers[0]
    self.safe_register = utils.flatten(registers[1:-1])
    self.very_safe_register = utils.flatten(registers[2:-2])
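# Several snippets here (this one, and the multiprocessing examples below that call
# split_list(filelist, n)) divide a sequence into n roughly equal parts. The real
# helpers are not shown; a minimal sketch under that assumption:
def split_list(items, n_chunks):
    """Split `items` into `n_chunks` contiguous pieces whose lengths differ by at most one."""
    k, r = divmod(len(items), n_chunks)
    chunks, start = [], 0
    for i in range(n_chunks):
        end = start + k + (1 if i < r else 0)
        chunks.append(items[start:end])
        start = end
    return chunks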
def split_bycol(mat, rate=0.7):
    row, col = mat.shape
    col_list = sum(mat)
    total = sum(col_list)
    threshold = total * rate / col
    col_list /= threshold
    _splits = split_list(col_list, 0)
    res_list = filter(lambda res: res[1] != [], _splits)
    return res_list
def parallel_global_stiffness(mesh) -> GlobalStiffnessLil:
    N = len(mesh.nodes)
    shape = (2 * N, 2 * N)
    with Pool(N_PROCESSES) as pool:
        all_args = zip(split_list(list(mesh.elements.values()), N_PROCESSES),
                       repeat(shape))
        results = pool.starmap(GlobalStiffnessCoo.from_elements, all_args)
    K = sparse.csr_matrix(shape)
    for result in results:
        K += result
    K = K.tolil()
    # print('conversion done')
    return K
def load_frame_datas(data_path):
    data_path_list = glob.glob("%s/data_json/frame_*.json" % data_path)
    frame_data_list = []
    number_of_peoples_in_frames = []
    for i in range(len(data_path_list)):
        file_path = join(data_path, 'data_json', 'frame_%d.json' % i)
        with open(file_path, 'r') as f:
            pose_result = json.loads(f.read())
        # materialize the map so len() and later indexing also work on Python 3
        peoples = list(map(lambda people: utils.split_list(people['pose_keypoints'], 3),
                           pose_result['people']))
        number_of_peoples_in_frames.append(len(peoples))
        frame_data_list.append(peoples)
    return frame_data_list, number_of_peoples_in_frames
def parse(full_input: List[Word]) -> Optional[ParseResult]:
    inputs = split_list(full_input, separators)
    if len(inputs) <= 1:
        # There were no occurrences of the separators.
        return FailureParse()

    results = [single_action().parse(words) for words in inputs]
    filtered = [result for result in results if result.is_success()]  # Ignore partials
    actions = [r.parsed for r in filtered]

    return SuccessParse(Composite(actions), 1.0, [])
async def index(request):
    points = Point.select()
    init_point = points[0] if points else default_point_factory()
    lines = [line for line in split_list(points, lambda p: p.is_newline)]
    return {
        "points": points,
        "token": yandex,
        "init_point": init_point,
        "lines": lines
    }
def multi_process_generate(load_dir, save_dir, savename):
    """
    Extracts datapoints from all .json files in train_dir and saves them in a new .csv file
    :param load_dir: The directory to load from
    :param save_dir: The directory to save the extracted headers
    :param savename: The filename to save
    :param num_headers: The amount of headers to use as datapoint
    """
    csvfiles = []
    for csv in glob.iglob(save_dir + '*.csv'):
        csvfiles.append(os.path.basename(csv))

    # Load all files
    files = []
    for fullname in glob.iglob(load_dir + '*.json'):
        filename = os.path.basename(fullname)
        csvname = filename.split('.')[0] + '.csv'
        if csvname in csvfiles:
            os.rename(fullname, load_dir + 'processed_json/' + filename)
        else:
            files.append(fullname)

    manager = multiprocessing.Manager()
    dataframes = manager.list()
    filelist = glob.glob(load_dir + '*.json')
    splits_count = multiprocessing.cpu_count()
    filesplits = split_list(filelist, splits_count)

    threads = []
    for split in filesplits:
        # create a thread for each
        t = multiprocessing.Process(target=save_parse_result, args=(split, dataframes))
        threads.append(t)
        t.start()
        print(t.name + ' starting')

    # create one large dataframe
    for t in threads:
        t.join()
        print("Process joined: ", t)
    data = pd.concat(dataframes)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    data.to_csv(save_dir + savename, mode='w')
def parallel_calculate_array_values(self, U):
    for i in range(len(self.nodes)):
        self.nodes[i].values['displacement'] = np.array([U[2 * i], U[2 * i + 1]])
    with Pool(N_PROCESSES) as pool:
        element_groups = split_list(list(self.elements.values()), N_PROCESSES)
        strain_and_stress_arrays = pool.map(get_arrays, element_groups)
    for i, (strain_array, stress_array) in enumerate(strain_and_stress_arrays):
        group = element_groups[i]
        for j, element in enumerate(group):
            element.values['strain'] = strain_array[j]
            element.values['stress'] = stress_array[j]
def _custom_kcrossvalidation(self, x, y):
    '''
    Function creates cross-validation folds for x and y inputs.
    :param x: 3d input array: ntrials x ntimepoints x nfeatures
    :param y: vector of labels: ntrials,
    :param k: number of splits of the data
    :param n_val: number of trials in validation set
    :return: Train, Val and Test lists of nfolds length, each item is a tuple of x and y data
    '''
    n = x.shape[0]
    shape = x.shape[1:]
    x = x.reshape(n, -1)
    if self.shuffle:
        x, t = self._shuffle_data(x, y)
    l = range(n)
    test_folds = utils.split_list(l, self.k)
    Train, Val, Test = [], [], []
    for i, t in enumerate(test_folds):
        xc = x.copy()
        yc = y.copy()
        Test.append([
            xc[t].reshape([-1] + [shape[s] for s in range(len(shape))]),
            y[t]
        ])
        xc = np.delete(xc, t, axis=0)
        yc = np.delete(yc, t, axis=0)
        xc = np.roll(xc, -len(t) * i, axis=0)
        yc = np.roll(yc, -len(t) * i, axis=0)
        Val.append([
            xc[:len(t)].reshape([-1] + [shape[s] for s in range(len(shape))]),
            yc[:len(t)]
        ])
        Train.append([
            xc[len(t):].reshape([-1] + [shape[s] for s in range(len(shape))]),
            yc[len(t):]
        ])
    return Train, Test, Val
def process_resources(threads):
    """Download all the unprocessed resources"""

    class Downloader(Thread):

        def __init__(self, resources):
            self.resources = resources
            super(Downloader, self).__init__()

        def run(self):
            boilerpipe.jpype.attachThreadToJVM()
            for res in self.resources:
                try:
                    content = download(res.url)
                    content = boilerpipe.transform(content)
                except:
                    content = ""
                if content and len(content) >= 200:
                    res.textual = True
                # we don't want documents of less that 25 chars
                if not content:
                    res.blacklisted = True
                    print "blacklisted %s" % res.url
                else:
                    res.content = content
                    print "downloaded %s" % res.url
                res.processed = True
                res.save()

    if threads > multiprocessing.cpu_count():
        threads = multiprocessing.cpu_count()

    # initialise the JVM
    boilerpipe.start_jvm()

    resources = list(db.resources.Resource.find({'processed': False}))
    print "download %s urls using %s threads" % (len(resources), threads)

    # split the resource into the number of threads
    resources = split_list(resources, threads)

    # start the threads and pass them the resources to be processed
    for i in range(threads):
        d = Downloader(resources[i])
        d.start()
def list_scrapers(self, update, context):
    """
    Lists the scrapers and opens an interactive keyboard.

    Args:
        update (Update): Object with the chat and user data.
        context (CallbackContext): Context object.
    """
    keyboard = [
        InlineKeyboardButton(
            text=scraper_name,
            callback_data=f"\\scraper_selected:{scraper_name}",
        )
        for scraper_name in self.scrapers
    ]
    keyboard_splitted = utils.split_list(input_list=keyboard, size=2)
    reply_markup = InlineKeyboardMarkup(inline_keyboard=keyboard_splitted)
    update.message.reply_text("Concursos cadastrados no bot:", reply_markup=reply_markup)
def process_pcap_to_h5(read_dir, save_dir, session_threshold=5000):
    """
    Use this method to process all pcap files in a directory to the h5 format.
    The session threshold is used to filter out all sessions containing fewer packets.
    :param save_dir:
    :param read_dir: Directory containing pcap files that should be converted into h5 format
    :param session_threshold: Threshold to filter out sessions with fewer packets
    :return: None
    """
    h5files = []
    for h5 in glob.iglob(save_dir + '*.h5'):
        h5files.append(os.path.basename(h5))

    # Load all files
    files = []
    for fullname in glob.iglob(read_dir + '*.pcap'):
        filename = os.path.basename(fullname)
        h5name = filename + '.h5'
        if h5name in h5files:
            os.rename(fullname, read_dir + '/processed_pcap/' + filename)
        else:
            files.append(fullname)

    splits = 4
    files_splits = split_list(files, splits)
    processes = []
    for file_split in files_splits:
        # create a thread for each
        t1 = multiprocessing.Process(target=save_pcap_task,
                                     args=(file_split, save_dir, session_threshold))
        print("Starting process", t1)
        processes.append(t1)
        t1.start()

    for process in processes:
        process.join()
        print("Process joined", process)
def answer_xval_lr(args):
    """
    Answer questions on a cross-validation dataset by doing the following:
    1. Extract (or load) feature strings for the training and test set
    2. Parse the feature strings to compute feature vectors.
    2. ???
    3. Profit

    Args:
        args: ArgumentParser arguments defined in __main__

    Returns:
        None
    """
    pages_dict = pickle.load(open('../data/wiki_pages_dict.pkl', 'rb'))

    if not args.load:
        train_reader, _ = read_csv_data.read_csv_data()
        train = list(train_reader)
        random.shuffle(train)  # split train for X-val
        if args.limit > 0:
            train = train[0:args.limit]
        trainx, testx = utils.split_list(train, (args.split, 100. - args.split))
        print("len(xval_train) = {}, len(xval_test) = {}"
              .format(len(trainx), len(testx)))
        analyzer = similarity.Analyzer()
        feat = similarity.Featurizer(analyzer, pages_dict)
        print("Computing feature strings:")
        fs, fv = feat.compute_feat_strings(trainx + testx, print_info=True)
        pickle.dump((trainx, testx, fs, fv, analyzer, feat),
                    open('../data/xval_feat_strings.pkl', 'wb'),
                    pickle.HIGHEST_PROTOCOL)
    elif args.load:  # load pre-computed feature strings
        print("Loading precomputed feature strings for trainx and testx:")
        trainx, testx, fs, fv, analyzer, feat = \
            pickle.load(open('../data/xval_feat_strings.pkl', 'rb'))

    ## Here we do some cross-validation
    X = feat.compute_feats(fs)
    X = X.tocsr()  # might already be CSR
    X.sort_indices()  # needed for cosine-type measures
    # X_scaler = sklearn.preprocessing.StandardScaler(with_mean=False, with_std=True)
    # X = X_scaler.fit_transform(X)

    # try some LDA stuff
    print("Training LDA topic model")
    topic_mod = lda.LDA(n_topics=20, n_iter=150)
    tm_analyzer = topic_model.Analyzer()
    tm_feat = topic_model.Featurizer(tm_analyzer, pages_dict)
    # use the same feature strings as similarity
    tm_fs = topic_model.add_wiki_categories(trainx + testx, fs, fv, pages_dict)
    topic_X = tm_feat.compute_feats(tm_fs)
    topics = topic_mod.fit_transform(topic_X)  # gives probabilities for each topic
    # topics_scaler = sklearn.preprocessing.StandardScaler(with_mean=True, with_std=True)
    # topics = topics_scaler.fit_transform(topics)

    # compute similarity for each question and each answer (of 4)
    # use this as X (e.g. NLP similarity, LDA similarity)
    # binary classification with LR (i.e. is the answer right or not)
    print("Evaluating train data:")
    X_lr_train, y_lr_train = compute_scores(trainx, X, fv,
                                            scorer=similarity.Scorer.cosine,
                                            topics=topics, train=True,
                                            print_info=True)

    print("Training LR")
    # standardizing
    lr_scaler = sklearn.preprocessing.StandardScaler(with_mean=True, with_std=True)
    X_lr_train = lr_scaler.fit_transform(X_lr_train)
    # alpha sets the weight on regularization term
    lr = sklearn.linear_model.SGDClassifier(loss='log', penalty='l2',
                                            n_iter=100, shuffle=True, fit_intercept=True,
                                            class_weight={0: .1, 1: .9})
    lr.fit(X_lr_train, y_lr_train)
    # lr.coef_[0,0] = 0.75
    # lr.coef_[0,1] = 0.25
    # lr.intercept_[0] = 0.0
    print(lr.coef_)
    print(lr.intercept_)

    our_answers = lr_make_predictions(X_lr_train, lr)
    acc_trainx = compute_accuracy(trainx, our_answers)
    print("Train accuracy = {}\n".format(acc_trainx))

    print("Evaluating test data:")
    X_lr_test = compute_scores(testx, X, fv,
                               scorer=similarity.Scorer.cosine,
                               topics=topics, print_info=True)
    X_lr_test = lr_scaler.transform(X_lr_test)
    our_answers = lr_make_predictions(X_lr_test, lr)
    acc_testx = compute_accuracy(testx, our_answers)
    print("Test accuracy = {}\n".format(acc_testx))
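# utils.split_list(train, (args.split, 100.-args.split)) above takes a tuple of
# percentages. The real helper is not shown; a hypothetical sketch, assuming the
# second argument gives relative weights of consecutive slices of the list:
def split_list(items, percentages):
    """Split `items` into consecutive slices sized in proportion to `percentages`."""
    total = float(sum(percentages))
    result, start = [], 0
    for p in percentages[:-1]:
        end = start + int(round(len(items) * p / total))
        result.append(items[start:end])
        start = end
    result.append(items[start:])  # remainder goes into the last slice
    return result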
def ipheadertask(filelist):
    j = 1
    for fullname in filelist:
        print("Loading filenr: {}".format(j))
        load_dir, filename = os.path.split(fullname)
        df = utils.load_h5(load_dir, filename)
        frames = df['bytes'].values
        for i, frame in enumerate(frames):
            p = np.fromstring(frame, dtype=np.uint8)
            if p[14] != 69:
                print("IP Header length not 20! in file {0}".format(filename))
        j += 1


if __name__ == '__main__':
    filelist = glob.glob(load_dir + '*.h5')
    filesplits = utils.split_list(filelist, 4)
    threads = []
    for split in filesplits:
        # create a thread for each
        t = multiprocessing.Process(target=ipheadertask, args=(split, ))
        threads.append(t)
        t.start()

    # create one large dataframe
    for t in threads:
        t.join()
        print("Process joined: ", t)
from utils import csv2list, split_list, list2csv

all_path = '/data1/sap/frcnn_keras/data/mv_test_backup.txt'
l1_path = '/data1/sap/frcnn_keras/data/mv_val.txt'
l2_path = '/data1/sap/frcnn_keras/data/mv_test.txt'

ratio = .5

all_list = csv2list(all_path)
size = int(len(all_list) * ratio)
l1, l2 = split_list(all_list, size)

l1 = sorted(l1)
l2 = sorted(l2)

list2csv(l1_path, l1)
list2csv(l2_path, l2)
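# The split_list imported above (also used in the scene-split snippet further down)
# appears to cut a list at a given index. Its implementation is not shown; a minimal
# sketch under that assumption:
def split_list(items, size):
    """Return the first `size` items and the remaining items as two lists."""
    return items[:size], items[size:]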
def geocode_addresses(self, addresses):
    addresses = list(filter(None, map(str.strip, addresses)))
    geocoded = geocode_tuples(addresses, region=self.cleaned_data["region"])
    return split_list(tuple.__instancecheck__, geocoded)
def answer_xval(args):
    """
    Answer questions on a cross-validation dataset by doing the following:
    1. Extract (or load) feature strings for the training and test set
    2. Parse the feature strings to compute feature vectors.
    2. ???
    3. Profit

    Args:
        args: ArgumentParser arguments defined in __main__

    Returns:
        None
    """
    pages_dict = pickle.load(open('../data/wiki_pages_dict.pkl', 'rb'))
    train_pos = pickle.load(open('pos/train_pos.pkl', 'rb'))
    test_pos = pickle.load(open('pos/test_pos.pkl', 'rb'))
    all_pos = train_pos + test_pos
    example_d = pickle.load(open('pos/sim_prob_dict.pkl', 'rb'))

    row_num = 0
    old_ans = []
    with open('pos/our_answers.csv', 'rb') as csvfile:
        ans_reader = csv.reader(csvfile, delimiter=',')
        for row in ans_reader:
            if row_num > 0:
                old_ans.append({'id': row[0], 'correctAnswer': row[1]})
            row_num += 1

    if not args.load:
        train_reader, test_reader = read_csv_data.read_csv_data()
        train = list(train_reader)
        test = list(test_reader)
        all_data = train + test
        random.shuffle(train)  # split train for X-val
        if args.limit > 0:
            train = train[0:args.limit]
        trainx, testx = utils.split_list(train, (args.split, 100. - args.split))
        print("len(xval_train) = {}, len(xval_test) = {}"
              .format(len(trainx), len(testx)))
        # analyzer = similarity.Analyzer()
        # feat = similarity.Featurizer(analyzer, pages_dict)
        # print("Computing feature strings:")
        # fs, fv = feat.compute_feat_strings(trainx + testx, print_info=True)

    #####################################
    # use_data = train
    # use_pos = train_pos
    use_data = all_data
    use_pos = all_pos

    ind = 0
    num_this = 0
    ans_types = {}
    num_q = 0
    old_relevant = []
    # for kk in trainx:
    for kk in use_data:
        for kk_pos in use_pos:
            # for kk_pos in train_pos:
            if kk_pos['id'] == kk['id']:
                break
        # for kk_old in old_ans:
        #     if kk_old['id'] == kk['id']:
        #         break
        # ans_types.append(question_features2(kk))
        # ans_types.append(question_features2(kk_pos))
        [k, t] = question_features2(kk_pos)
        if k != 0:
            ans_types[k] = t
            num_q += 1
        # old_relevant.append(kk_old)
        ind += 1
        sys.stdout.write("Parse Progress: %f%% \r" % (ind * 100 / float(len(use_data))))
        sys.stdout.flush()

    num_empty = 0
    for ans in ans_types:
        if not ans:
            num_empty += 1

    pred_list = {}
    ind = 0
    max_ind = len(use_data)
    for kk in range(0, len(use_data)):
        # if ind > max_ind:
        #     break
        if use_data[kk]['id'] in ans_types.keys():
            ind += 1
            pred_list[use_data[kk]['id']] = answer_question(
                use_data[kk], ans_types[use_data[kk]['id']], pages_dict)
        else:
            ind += 1
            pred_list[use_data[kk]['id']] = []
        sys.stdout.write("Parse Progress: %f%% \r" % (ind * 100 / max_ind))
        sys.stdout.flush()

    '''
    for kk in range(0, len(ans_types)):
        if ind > max_ind:
            break
        if (ans_types[kk]):
            ind += 1
            # pred_list.append(google_ans(trainx[kk], ans_types[kk]))
            # pred_list.append(answer_question(trainx[kk], ans_types[kk], pages_dict))
            pred_list.append(answer_question(use_data[kk], ans_types[kk], pages_dict))
        else:
            ind += 1
            pred_list.append([])
        sys.stdout.write("Parse Progress: %f%% \r" % (ind * 100 / max_ind))
        sys.stdout.flush()
    '''

    corr = 0
    total = 0
    for p in range(0, len(train)):
        q_key = train[p]['id']
        if q_key in pred_list.keys():
            if pred_list[q_key]:
                if pred_list[q_key] == train[p]['correctAnswer']:
                    # if pred_list[p] == old_relevant[p]['correctAnswer']:
                    corr += 1
                total += 1

    print('Performance: ' + str(corr / float(total)))
    print('Fraction Answered: ' + str(float(total) / float(len(use_data))))

    final_answers = pickle_ans(pred_list, use_data)
    pdb.set_trace()
    filepath = 'pos/metric_dict_10_90.pkl'
    pickle.dump(final_answers, open(filepath, 'wb'))
# optimizer = BertAdam(warmup=0.05, t_total=len(train_dataloader))
model.zero_grad()
model.to(device)
for epoch in range(epoch):
    model.train()
    train_loss = 0
    for label, query, l_query, pos, candidate_abstract, l_abstract, \
            candidate_labels, l_labels, candidate_type, candidate_abstract_numwords, \
            candidate_numattrs in tqdm(train_dataloader):
        if label.size()[0] == 1:
            continue
        # print(len(label))
        n_split = 100
        # if len(label > n_split):
        query_sp = split_list(query, n=n_split)
        l_query_sp = split_list(l_query, n=n_split)
        pos_sp = split_list(pos, n=n_split)
        candidate_abstract_sp = split_list(candidate_abstract, n=n_split)
        l_abstract_sp = split_list(l_abstract, n_split)
        candidate_labels_sp = split_list(candidate_labels, n_split)
        l_labels_sp = split_list(l_labels, n_split)
        candidate_type_sp = split_list(candidate_type, n_split)
        candidate_numattrs_sp = split_list(candidate_numattrs, n_split)
        candidate_abstract_numwords_sp = split_list(candidate_abstract_numwords, n_split)
        parts = len(query_sp)
        pred_set = []
        for i in range(parts):
            query = query_sp[i]
def find_matching_proposals(self, region):
    proposals = self.cleaned_data["proposals"]
    return split_list(lambda p: p.region_name == region, proposals)
def test_split_list():
    x = ['hello', 'from', 'hell']
    assert u.split_list(x, 'from') == [['hello'], ['hell']]
    assert u.split_list(x, 'hello') == [[], ['from', 'hell']]
    assert u.split_list(x, 'hell') == [['hello', 'from'], []]
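# The test above pins down a separator-based split_list (as used in parse_cp_mv).
# The project's implementation is not shown; a minimal sketch consistent with the
# three assertions:
def split_list(items, separator):
    """Split `items` into the parts before and after the first `separator`."""
    idx = items.index(separator)  # assumption: raises ValueError if the separator is absent
    return [items[:idx], items[idx + 1:]]
# Behaviour for a missing or repeated separator is not covered by the test and is
# left unspecified here.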
def library_chapter(self):
    """
    Chapter-crawling action.
    :return:
    """
    start_url = self.get_chapter_url()
    try:
        self.driver.get(start_url)
        WebDriverWait(self.driver, 30).until(
            ec.visibility_of_element_located(
                (By.XPATH, '//div[@class="tree-head"]/span[@id="spanEdition"]')))
    except TimeoutException as e:
        self.sinOut.emit('超时!!! %s' % str(e))
        self.driver.get_screenshot_as_file('./error.png')
        return

    teaching = self.driver.find_element_by_xpath(
        '//div[@class="tree-head"]/span[@id="spanEdition"]').text
    level_name = self.driver.find_element_by_xpath(
        '//div[@class="tree-head"]/span[@id="spanGrade"]').text
    teaching = teaching.replace(':', '').replace(':', '')
    self.sinOut.emit('进行爬取章节!')
    if self.teaching_name != teaching or self.level_name != level_name:
        self.message_box.emit('警告', "没有数据!")
        return

    et = etree.HTML(self.driver.page_source)
    library_id = self.teaching
    sub_obj = et.xpath('//ul[@id="JYE_POINT_TREE_HOLDER"]/li')
    chapters_list = list()
    total = len(sub_obj)
    current_count = 0
    for item in sub_obj:
        lc_item = dict()
        lc_item['id'] = str(uuid.uuid1())
        pk = item.attrib.get('pk')
        nm = item.attrib.get('nm')
        child = utils.recursive_get_li(lc_item['id'], library_id, item)
        lc_item['pk'] = pk
        lc_item['parent_id'] = ''
        lc_item['library_id'] = library_id
        lc_item['name'] = nm
        lc_item['child'] = child
        chapters_list.append(lc_item)
        current_count += 1
        self.crawler_chapter_progress.emit(current_count, total)

    self.sinOut.emit('正在解析入库')
    if chapters_list:
        mutex.acquire()
        chapters = self.db_connect.session.query(
            LibraryChapter.name, LibraryChapter.id, LibraryChapter.pk).filter(
                LibraryChapter.library_id == library_id)
        new_list = utils.split_list(chapters_list)
        if chapters.count() > 0:
            # if chapter data already exists, update it
            relational_dict = dict()
            for item in chapters:
                # new_list = self.update_chapter_pk_id(item.id, item.pk, new_list)
                for item2 in new_list:
                    if item2.get('pk') == item.pk:
                        relational_dict[item2['id']] = item.id
                        item2['id'] = item.id
                        break
            for item3 in new_list:
                if item3.get('parent_id') and relational_dict.get(item3['parent_id']):
                    item3['parent_id'] = relational_dict.get(item3['parent_id'])
            chapters.delete()
            self.db_connect.session.commit()
        mutex.release()
        # insert the new values
        for item in new_list:
            mutex.acquire()
            if 'child' in item:
                del item['child']
            self.db_connect.add(LibraryChapter(**item))
            mutex.release()
    self.sinOut.emit('章节爬取完成,重新加载查看')
def answer_xval(args):
    """
    Answer questions on a cross-validation dataset by doing the following:
    1. Extract (or load) feature strings for the training and test set
    2. Parse the feature strings to compute feature vectors.
    2. ???
    3. Profit

    Args:
        args: ArgumentParser arguments defined in __main__

    Returns:
        None
    """
    pages_dict = pickle.load(open('../data/wiki_pages_dict.pkl', 'rb'))

    if not args.load:
        train_reader, _ = read_csv_data.read_csv_data()
        train = list(train_reader)
        random.shuffle(train)  # split train for X-val
        if args.limit > 0:
            train = train[0:args.limit]
        trainx, testx = utils.split_list(train, (args.split, 100. - args.split))
        print("len(xval_train) = {}, len(xval_test) = {}"
              .format(len(trainx), len(testx)))
        analyzer = similarity.Analyzer()
        feat = similarity.Featurizer(analyzer, pages_dict)
        print("Computing feature strings:")
        fs, fv = feat.compute_feat_strings(trainx + testx, print_info=True)
        pickle.dump((trainx, testx, fs, fv, analyzer, feat),
                    open('../data/xval_feat_strings.pkl', 'wb'),
                    pickle.HIGHEST_PROTOCOL)
    elif args.load:  # load pre-computed feature strings
        print("Loading precomputed feature strings for trainx and testx:")
        trainx, testx, fs, fv, analyzer, feat = \
            pickle.load(open('../data/xval_feat_strings.pkl', 'rb'))
        # XXX use this one instead
        # feat = None
        # analyzer = similarity.Analyzer()
        # feat = similarity.Featurizer(analyzer, pages_dict)

    ## Here we do some cross-validation
    X = feat.compute_feats(fs)
    X = X.tocsr()  # might already be CSR
    X.sort_indices()  # needed for cosine-type measures

    # try some LDA stuff
    print("Training LDA topic model")
    topic_mod = lda.LDA(n_topics=20, n_iter=150)
    tm_analyzer = topic_model.Analyzer()
    tm_feat = topic_model.Featurizer(tm_analyzer, pages_dict)
    # use the same feature strings as similarity
    tm_fs = topic_model.add_wiki_categories(trainx + testx, fs, fv, pages_dict)
    topic_X = tm_feat.compute_feats(tm_fs)
    topics = topic_mod.fit_transform(topic_X)  # gives probabilities for each topic

    print("Evaluating train data:")
    acc_trainx = test_xval(trainx, X, fv,
                           # scorer=similarity.Scorer.cosine, print_info=True)
                           scorer=similarity.Scorer.cosine, topics=topics, print_info=True)
    print("Train accuracy = {}\n".format(acc_trainx))

    print("Evaluating test data:")
    acc_testx = test_xval(testx, X, fv,
                          # scorer=similarity.Scorer.cosine, print_info=True)
                          scorer=similarity.Scorer.cosine, topics=topics, print_info=True)
    print("Test accuracy = {}\n".format(acc_testx))
    # Check FASTA
    with open(ofile, 'r') as of:
        count = 0
        for record in SeqIO.parse(of, 'fasta'):
            count += 1
        assert count == len(set(ids)), "FASTA ERROR: missing IDs: %s" % (cmd)
    return


if __name__ == "__main__":
    # Args
    ARGS = get_arg_parser().parse_args()

    # Tables
    ids = set()
    for tab in ARGS.tabs:
        tmp = pandas.read_csv(filepath_or_buffer=tab, sep='\t', header=0)
        assert ARGS.icol in list(tmp.columns), \
            'File %s has no column \"%s\"' % (tab, ARGS.icol)
        print('TAB %s: %d IDs' % (tab, tmp.shape[0]))
        ids.update(list(tmp[ARGS.icol]))
    print('IDS: %d' % len(ids))

    # Download
    pool = Pool(ARGS.cores)
    tmp = pool.starmap(
        download_sequences,
        enumerate([chunk for chunk in split_list(ids, ARGS.size)]))
    pool.close()
    pool.join()
def __init__(self, keypoint_vector):
    self.key_points = utils.split_list(keypoint_vector, 3)
    for file_name in file_names:
        scene_num = get_value_in_pattern(file_name, pattern)
        scene_num_set[cam_idx].add(scene_num)

valid_scene_num = scene_num_set[cam_num[0]]
for cam_idx in cam_num:
    valid_scene_num &= scene_num_set[cam_num[cam_idx]]
all_scene_num = sorted(list(valid_scene_num))
# print(all_scene_num)

if (len(all_scene_num) != total):
    print('len(all_scene_num) != total', len(all_scene_num), '!=', total)

train_scene_num, test_scene_num = split_list(all_scene_num, num_train)
val_scene_num, test_scene_num = split_list(test_scene_num, num_val)
test_scene_num = test_scene_num[:num_test]
for l in [train_scene_num, val_scene_num, test_scene_num]:
    l.sort()

print(cnt, cls, len(train_scene_num), len(val_scene_num), len(test_scene_num), sep='\t')

train_start_idx = num_cls[cls]
val_start_idx = train_start_idx + len(train_scene_num)
test_start_idx = val_start_idx + len(val_scene_num)
num_cls[cls] = test_start_idx + len(test_scene_num)
def cross_validate(path_to_df, text_field, label_field, n_folds=5, preprocessing_function=None,
                   additional_fields_and_preps={}, additional_data_paths=[], hyperparams={},
                   report_top_k=True, log_dir="./", use_gpu=False, return_models=False,
                   seed=17, verbose=False, remove_extra_labels=True):
    """
    :param path_to_df: str, path to csv or parquet file
    :param text_field: str, column of the dataframe in which is the text that should be classified
    :param label_field: str, column of the dataframe in which is the label of the corresponding text
    :param n_folds: int, number of folds
    :param preprocessing_function: function, function to apply on text_field column
    :param additional_fields_and_preps: dict. Dictionary in the following format
        {field_name1: preprocessing_function1, field_name2: preprocessing_function2}
        to enable custom preprocessing for different fields
    :param additional_data_paths: list of str, paths of fasttext format additional data to concat with train file
    :param hyperparams: dict, all hyperparams for train_supervised
    :param report_top_k: bool. If True will return top k scores, otherwise top 1 scores
    :param log_dir: str, directory to save the training files and the model
    :param use_gpu: bool, use gpu for training
    :param return_models: bool. If True will return tuple (scores, models)
    :param seed: int
    :param verbose: bool.
    :param remove_extra_labels: remove datapoints with labels which appear in additional_data_paths
        but not in train_data_path
    :return: list. The scores for each split
    """
    models, scores = [], []
    if path_to_df.endswith("parquet"):
        df = pd.read_parquet(path_to_df)
    else:
        df = pd.read_csv(path_to_df)

    for added_field, prep_f in additional_fields_and_preps.items():
        if df[added_field].dtype != "object":
            df[added_field] = df[added_field].astype(str)
        if prep_f:
            df[added_field] = df[added_field].map(prep_f)
        df[text_field] = df[text_field] + " " + df[added_field]

    for fold_number, val_mask in enumerate(split_list(len(df), n_folds, seed)):
        train_data_path, val_data_path = preprocess_and_save(df, val_mask, text_field,
                                                             label_field, preprocessing_function,
                                                             additional_fields_and_preps,
                                                             "./tmp_txt/",
                                                             "_split{}".format(fold_number),
                                                             verbose, [])
        if verbose:
            print("train path {}".format(train_data_path))
            print("val path {}".format(val_data_path))

        hypers_new = hyperparams.copy()
        if additional_fields_and_preps:
            hypers_new["result_dir"] = os.path.join(
                log_dir, "{}_{}".format(hash_function(preprocessing_function),
                                        "_".join(additional_fields_and_preps.keys())))
        else:
            hypers_new["result_dir"] = os.path.join(log_dir, hash_function(preprocessing_function))
        hypers_new["use_gpu"] = int(use_gpu)
        hypers_new["split_and_train_params"] = {
            "df_path": path_to_df,
            "additional_fields_and_preps": additional_fields_and_preps,
            "remove_extra_labels": remove_extra_labels
        }

        model = train_supervised(train_data_path=train_data_path,
                                 val_data_path=val_data_path,
                                 additional_data_paths=additional_data_paths,
                                 hyperparams=hypers_new,
                                 preprocessing_function=preprocessing_function,
                                 remove_extra_labels=remove_extra_labels,
                                 log_dir=log_dir,
                                 use_gpu=use_gpu,
                                 verbose=verbose)
        if report_top_k:
            scores.append(model.top_k_accuracy)
        else:
            scores.append(model.top_1_accuracy)
        if return_models:
            models.append(model)
        del model
        gc.collect()

    if return_models:
        return scores, models
    return scores
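# The split_list(len(df), n_folds, seed) call above yields one validation selection
# per fold. The library's actual helper is not shown; a hypothetical sketch, assuming
# it shuffles the row indices with the given seed and yields one index list per fold
# (whether the real helper returns index lists or boolean masks is an assumption):
import random


def split_list(n, n_folds, seed):
    """Yield `n_folds` disjoint lists of row indices covering range(n), shuffled by `seed`."""
    indices = list(range(n))
    random.Random(seed).shuffle(indices)
    fold_size = n // n_folds
    for fold in range(n_folds):
        start = fold * fold_size
        end = (fold + 1) * fold_size if fold < n_folds - 1 else n
        yield indices[start:end]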