def extract_instances(self, train_instances):
    """ extract features from train instances """
    self.extract_information(train_instances)
    features = []
    infos = []
    process_bar = pyprind.ProgPercent(len(train_instances))

    ''' get features from train instances '''
    alignment_feature_file = self.feature_file.replace('IdfAlignmentFeature', 'AlignmentFeature')
    alignment_features = utils.create_read_file(alignment_feature_file).readlines()
    idf_weight = self.idf_weight
    default_idf_weight = min(idf_weight.values())

    for train_instance, alignment_feature in zip(train_instances, alignment_features[1:]):
        process_bar.update()
        alignment_feature = alignment_feature.split('\t#\t')[1]
        # list of [sa_idx, sb_idx] pairs, indices start from 1
        myWordAlignments = json.loads(alignment_feature)[0]
        word_sa, word_sb = train_instance.get_word(type='lemma', lower=True)

        sa_aligned = [sa_idx - 1 for sa_idx, sb_idx in myWordAlignments]
        sb_aligned = [sb_idx - 1 for sa_idx, sb_idx in myWordAlignments]
        sent1_aligned = [0] * len(word_sa)
        sent2_aligned = [0] * len(word_sb)
        for sa_index in sa_aligned:
            sent1_aligned[sa_index] = 1
        for sb_index in sb_aligned:
            sent2_aligned[sb_index] = 1

        # calc total and aligned idf weights for both sentences
        sent1_sum = 0
        sent2_sum = 0
        sent1_ali = 0
        sent2_ali = 0
        for idx, word in enumerate(word_sa):
            weight = idf_weight.get(word, default_idf_weight)
            sent1_ali += sent1_aligned[idx] * weight
            sent1_sum += weight
        for idx, word in enumerate(word_sb):
            weight = idf_weight.get(word, default_idf_weight)
            sent2_ali += sent2_aligned[idx] * weight
            sent2_sum += weight

        feature = [1.0 * (sent1_ali + sent2_ali) / (sent1_sum + sent2_sum + 1e-6)]
        info = [sent1_ali, sent2_ali, sent1_sum, sent2_sum]
        features.append(feature)
        infos.append(info)
    return features, infos
def get_random(in_file, p, to_file):
    # hard-coded total for the progress bar (presumably the number of lines in in_file)
    process_bar = pyprind.ProgPercent(12440969)
    with open(in_file) as fin, \
            open(to_file, "w") as fout:
        for line in fin:
            process_bar.update()
            if random.random() < p:
                fout.write(line)
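# Hypothetical usage sketch for get_random above (the paths are placeholders,
# and `import random` / `import pyprind` are assumed at module level): keep
# roughly 1% of the lines of a large corpus file.
get_random('corpus/full_corpus.txt', 0.01, 'corpus/sample_1pct.txt')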
def load_word_embedding_std(in_vocab, emb_file, n_dim=300):
    """
    load word embedding in a standard way
    Args:
        in_vocab: vocabulary of words to keep
        emb_file: path to the embedding text file
        n_dim: embedding dimension
    Returns:
        word2index, embeddings
    """
    word2index = {}
    embeddings = []
    word2index[pad_word] = 0
    embeddings.append(np.zeros(n_dim))
    word2index[unk_word] = 1
    embeddings.append(np.random.uniform(-0.25, 0.25, (n_dim, )))
    word_index = 2

    print('Load word embedding: %s' % emb_file)
    total_line = get_embedding_file_len(emb_file)
    process_bar = pyprind.ProgPercent(total_line)
    with open(emb_file, 'r', errors='ignore') as f:
        for idx, line in enumerate(f):
            process_bar.update()
            # skip the header line if it only contains (vocab_size, n_dim)
            if idx == 0 and len(line.split()) == 2:
                continue
            # split the line: everything before the last n_dim fields is the word
            sp = line.rstrip().split()
            if len(sp) != n_dim + 1:
                print(sp[0:len(sp) - n_dim])
            w = ''.join(sp[0:len(sp) - n_dim])
            emb = [float(x) for x in sp[len(sp) - n_dim:]]
            assert len(emb) == n_dim
            if w in in_vocab and w not in word2index:
                word2index[w] = word_index
                embeddings.append(emb)
                word_index += 1

    pre_trained_len = len(word2index)
    n_words = len(in_vocab)
    print('Pre-trained: {}/{} {:.2f}'.format(
        pre_trained_len, n_words, pre_trained_len * 100.0 / n_words))

    oov_word_list = [w for w in in_vocab if w not in word2index]
    print('oov word list example (30): ', oov_word_list[:30])
    pickle.dump(oov_word_list, open('./oov.p', 'wb'))

    embeddings = np.array(embeddings, dtype=np.float32)
    return word2index, embeddings
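# Hypothetical usage sketch for load_word_embedding_std above (the vocabulary,
# the GloVe-style file path, and the module-level pad_word/unk_word constants
# are assumptions, not part of the original snippet).
vocab = {'the', 'cat', 'sat'}
word2index, embeddings = load_word_embedding_std(vocab, 'glove.6B.300d.txt', n_dim=300)
cat_vector = embeddings[word2index.get('cat', word2index[unk_word])]  # falls back to the UNK row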
def main(topology):
    # the input to main is the path to the topology file
    # the output of this script is two json files saved inside the downloaded tweets directory:
    # one json file has all the active users, the other has all inactive users from the topology
    # user activity is based on status count and availability of tweets (public vs private)
    #
    # this script can be stopped and restarted in the middle of a run without losing progress
    inactive_users = read_json('dnld_tweets/inactive_users.json')
    active_users = read_json('dnld_tweets/active_users.json')
    twpy_api = auth.get_access_creds()
    tweets_dir = './dnld_tweets/'

    # put every single user (non repeating) from the topology file into a set
    with open(topology, 'r') as inp_file:
        comm_set = set(user for community in inp_file
                       for user in ast.literal_eval(community))

    # create directory for storing tweets
    if not os.path.exists(os.path.dirname(tweets_dir)):
        os.makedirs(os.path.dirname(tweets_dir), 0o755)

    # download tweets for every single user in the set;
    # separate active users from inactive users based on status count and availability
    bar = pyprind.ProgPercent(len(comm_set), track_time=True, title='Downloading Tweets')
    while comm_set:
        user = comm_set.pop()
        bar.update(item_id=str(user) + '\t')

        if str(user) in inactive_users or str(user) in active_users:
            continue

        # skip user if they don't exist or are inactive
        status_count = user_status_count(user, twpy_api)
        if status_count <= 10:
            inactive_users[str(user)] = status_count
            write_json(tweets_dir, active_users, inactive_users)
            continue

        # skip user if already downloaded their tweets
        if os.path.exists(os.path.join(tweets_dir, str(user))):
            active_users[str(user)] = status_count
            write_json(tweets_dir, active_users, inactive_users)
            continue

        tweets = get_tweets(user, twpy_api)
        if tweets:
            tweet_filename = tweets_dir + str(user)
            write_tweets(tweets, tweet_filename)
            active_users[str(user)] = status_count
        else:
            inactive_users[str(user)] = 0

        write_json(tweets_dir, active_users, inactive_users)
def load_word_embedding_my(word2index, emb_file, n_dim=300):
    """
    load word embedding in a tricky way:
        obtain an embedding matrix whose size equals len(word2index)
    Args:
        word2index: pre-built word -> index mapping
        emb_file: path to the embedding text file
        n_dim: embedding dimension
    Returns:
        word2index, embeddings
    """
    print('Load word embedding: %s' % emb_file)
    pre_trained = {}
    n_words = len(word2index)
    embeddings = np.random.uniform(-0.25, 0.25, (n_words, n_dim))
    embeddings[0, ] = np.zeros(n_dim)

    total_line = get_embedding_file_len(emb_file)
    process_bar = pyprind.ProgPercent(total_line)
    with open(emb_file, 'r') as f:
        for idx, line in enumerate(f):
            process_bar.update()
            # skip the header line if it only contains (vocab_size, n_dim)
            if idx == 0 and len(line.split()) == 2:
                continue
            # split the line: everything before the last n_dim fields is the word
            sp = line.rstrip().split()
            if len(sp) != n_dim + 1:
                print(sp[0:len(sp) - n_dim])
            w = ''.join(sp[0:len(sp) - n_dim])
            emb = [float(x) for x in sp[len(sp) - n_dim:]]
            if w in word2index and w not in pre_trained:
                embeddings[word2index[w]] = emb
                pre_trained[w] = 1

    pre_trained_len = len(pre_trained)
    print('Pre-trained: {}/{} {:.2f}'.format(
        pre_trained_len, n_words, pre_trained_len * 100.0 / n_words))

    oov_word_list = [w for w in word2index if w not in pre_trained]
    print('oov word list example (30): ', oov_word_list[:30])
    pickle.dump(oov_word_list, open('./oov.p', 'wb'))

    embeddings = np.array(embeddings, dtype=np.float32)
    return word2index, embeddings
def load_parse_data(train_file, parser=None, flag=False):
    """
    Load data after Parse, like POS, NER, etc.
    Value: [ SentPair:class, ... ]
    Parameter:
        flag: False (default), load from file (resources....)
              True, parse and write to file, and then load from file
    """
    ''' Pre-Define Write File '''
    # parse_train_file = config.PARSE_DIR + '/' + \
    #     utils.FileManager.get_file(train_file)
    parse_train_file = train_file.replace('./data', './generate/parse')

    if flag or not os.path.isfile(parse_train_file):
        print(train_file)
        if parser is None:
            raise RuntimeError(
                "parser should be init by ``nlp = stst.StanfordNLP('http://localhost:9000')``"
            )

        ''' Parse Data '''
        data = load_STS(train_file)
        print('*' * 50)
        print("Parse Data, train_file=%s, n_train=%d\n" % (train_file, len(data)))

        parse_data = []
        process_bar = pyprind.ProgPercent(len(data))
        for (sa, sb, score) in data:
            process_bar.update()
            parse_sa = parser.parse(sa)
            parse_sb = parser.parse(sb)
            parse_data.append((parse_sa, parse_sb, score))

        ''' Write Data to File '''
        with utils.create_write_file(parse_train_file) as f_parse:
            for parse_instance in parse_data:
                line = json.dumps(parse_instance)
                print(line, file=f_parse)

    ''' Load Data from File '''
    print('*' * 50)
    parse_data = []
    with utils.create_read_file(parse_train_file) as f:
        for line in f:
            parse_json = json.loads(line)
            sentpair_instance = SentPair(parse_json)
            parse_data.append(sentpair_instance)
    print("Load Data, train_file=%s, n_train=%d\n" % (train_file, len(parse_data)))
    return parse_data
def example_2():
    n = 1000000
    my_per = pyprind.ProgPercent(n, stream=1, track_time=True,
                                 title='My Percent Indicator', monitor=True)
    for i in range(n):
        # do some computation
        my_per.update()
    print('\n\nPrint tracking object ...\n')
    print(my_per)
def train(model, train_data_loader, optimizer_model, optimizer_step,
          criterion, use_cuda, epoch):
    totalloss_meter = AverageMeter()
    psnr_meter = AverageMeter()

    # switch to train mode
    model.train()

    # progress bar init
    bar = pyprind.ProgPercent(len(train_data_loader), update_interval=1)

    tic = time.time()
    for batch, batch_data in enumerate(train_data_loader):
        batch_data = batch_data.cuda()

        # Compute gradient and do optimizer step
        optimizer_model.zero_grad()

        # Compute output
        predictions, encoded = model(batch_data)

        # Calculate loss
        total_loss = criterion(batch_data, predictions)
        total_loss.backward()
        optimizer_model.step()
        if optimizer_step is not None:
            optimizer_step.step()

        toc = time.time() - tic
        tic = time.time()

        # Metrics
        psnr_ = psnr_metric(batch_data, predictions)
        totalloss_meter.update(total_loss.data.cpu().item(), batch_data.shape[0])
        psnr_meter.update(psnr_.data.cpu().item(), batch_data.shape[0])

        # Update log progress bar
        log_ = ' loss:' + '{0:4.4f}'.format(totalloss_meter.avg)
        log_ += ' psnr:' + '{0:4.4f}'.format(psnr_meter.avg)
        log_ += ' batch time:' + '{0:2.3f}'.format(toc)
        bar.update(item_id=log_)

        del total_loss, batch_data, predictions

    return totalloss_meter.avg, psnr_meter.avg
def parse_data(data):
    """parse data
    Returns:
        outputs: list of (parse_sa, parse_sb, score)
    """
    outputs = []
    process_bar = pyprind.ProgPercent(len(data), title='parse the data')
    for example in data:
        process_bar.update()
        sa, sb, score = example
        parse_sa, parse_sb = nlp.parse(sa), nlp.parse(sb)
        outputs.append((parse_sa, parse_sb, score))
    return outputs
def tagging_train_data(in_file, to_file, statistics_file):
    dict_P_to_count = {}
    NUM_NEGATIVE = 100000
    process_bar = pyprind.ProgPercent(1175201)
    with open(in_file) as fin, \
            open(to_file, "w") as fout:
        for line in fin:
            process_bar.update()
            line_list = line.strip().split("\t")
            dict_label_info_string = line_list[-1]
            dict_label_info = ujson.loads(dict_label_info_string)
            # collect the positive and negative samples for each predicate P
            for P in dict_label_info:
                if P not in dict_P_to_count:
                    dict_P_to_count[P] = {}
                    dict_P_to_count[P]["positive"] = 0
                    dict_P_to_count[P]["negative"] = 0
                for s_o in dict_label_info[P]["candidates"]:
                    label = dict_label_info[P]["candidates"][s_o]["label"]
                    # positive sample
                    if label > 0:
                        dict_P_to_count[P]["positive"] += 1
                        dict_label_info[P]["candidates"][s_o]["wanted"] = 1
                    else:
                        # negative sample, capped at NUM_NEGATIVE per predicate
                        if label < 0 and dict_P_to_count[P]["negative"] < NUM_NEGATIVE:
                            dict_P_to_count[P]["negative"] += 1
                            dict_label_info[P]["candidates"][s_o]["wanted"] = 1
                        else:
                            dict_label_info[P]["candidates"][s_o]["wanted"] = 0
            out_line_list = line_list[:-1] + [
                ujson.dumps(dict_label_info, ensure_ascii=False)
            ]
            fout.write("\t".join(out_line_list) + "\n")

    with open(statistics_file, "w") as fout:
        for P in dict_P_to_count:
            fout.write("%s\t%d\t%d\n" % (P, dict_P_to_count[P]["positive"],
                                         dict_P_to_count[P]["negative"]))
def upload_to_s3(s3_path, file_name=None, file_text=None, acl='private'):
    """Uploads a file to S3

    Args:
        s3_path(S3Path): Output path of the file to be uploaded
        file_name(str): Name of the file to be uploaded to S3
        file_text(str): Contents of the file to be uploaded
        acl(str): ACL policy of the file on S3
    """
    if not isinstance(s3_path, S3Path):
        raise ETLInputError('Input path should be of type S3Path')
    if not any([file_name, file_text]):
        raise ETLInputError('file_name or file_text should be given')

    if file_name:
        source_size = os.stat(file_name).st_size
    else:
        source_size = len(file_text)

    bar = None
    cb = None
    bucket = get_s3_bucket(s3_path.bucket)
    if s3_path.is_directory:
        key_name = os.path.join(s3_path.key, os.path.basename(file_name))
    else:
        key_name = s3_path.key
    key = bucket.new_key(key_name)

    if file_name:
        if source_size > LARGE_FILE_LIMIT:
            _multipart_upload(bucket, key_name, file_name)
        else:
            if source_size > CHUNK_SIZE:
                bar = pyprind.ProgPercent(PROGRESS_SECTIONS, monitor=True,
                                          title='Uploading %s' % file_name)

                def _callback(current, total):
                    bar.update()

                cb = _callback
            key.set_contents_from_filename(file_name, cb=cb,
                                           num_cb=PROGRESS_SECTIONS,
                                           policy=acl)
    else:
        key.set_contents_from_string(file_text, cb=cb,
                                     num_cb=PROGRESS_SECTIONS,
                                     policy=acl)
def aligner(parsed_data):
    r"""aligner
    sim(sa, sb) = \frac{sa_{aligned} + sb_{aligned}}{sa_{all} + sb_{all}}
    """
    preds = []
    golds = []
    process_bar = pyprind.ProgPercent(len(parsed_data))
    for example in parsed_data:
        process_bar.update()
        parse_sa, parse_sb, score = example
        features, infos = align_feats(parse_sa, parse_sb)
        preds.append(features[0])
        golds.append(score)
    return preds, golds
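# Worked example of the similarity formula in the docstring above (the numbers
# are illustrative, not taken from the source): with 4 of 6 tokens aligned in
# sa and 5 of 8 tokens aligned in sb, sim = (4 + 5) / (6 + 8) ≈ 0.643.
sa_aligned, sa_all = 4, 6
sb_aligned, sb_all = 5, 8
sim = float(sa_aligned + sb_aligned) / (sa_all + sb_all)  # 0.642857...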
def load_embed_from_text(emb_file, n_dim):
    """ load embedding from raw text
    Args:
        emb_file: path to the embedding text file
        n_dim: embedding dimension
    Returns:
        word2index: dict
        embeddings: numpy array
    """
    print('Load word embedding: %s' % emb_file)

    word2index = {}
    embeddings = []
    word2index[pad_word] = 0
    embeddings.append(np.zeros(n_dim))
    word2index[unk_word] = 1
    embeddings.append(np.random.uniform(-0.25, 0.25, (n_dim, )))
    word_index = 2

    total_line = get_embedding_file_len(emb_file)
    process_bar = pyprind.ProgPercent(total_line)
    with open(emb_file, 'r') as f:
        for idx, line in enumerate(f):
            process_bar.update()
            # skip the header line if it only contains (vocab_size, n_dim)
            if idx == 0 and len(line.split()) == 2:
                print('embedding info: ', line)
                continue
            # split the line: everything before the last n_dim fields is the word
            sp = line.rstrip().split()
            if len(sp) != n_dim + 1:
                print(sp[0:len(sp) - n_dim])
            w = ''.join(sp[0:len(sp) - n_dim])
            emb = [float(x) for x in sp[len(sp) - n_dim:]]
            word2index[w] = word_index
            embeddings.append(emb)
            word_index += 1

    embeddings = np.array(embeddings, dtype=np.float32)
    print('finished load input embed!!')
    return word2index, embeddings
def get_dict_author_keyword_freq(dict_author_keywords_path, to_file):
    dict_author_keywords_freq = {}
    dict_author_keywords = json.load(open(dict_author_keywords_path), encoding="utf-8")
    bar = pyprind.ProgPercent(len(dict_author_keywords))
    for autherId in dict_author_keywords:
        if autherId not in dict_author_keywords_freq:
            dict_author_keywords_freq[autherId] = Counter()
        keywords = dict_author_keywords[autherId]
        for keyword in keywords:
            dict_author_keywords_freq[autherId][keyword] += 1
        bar.update()
    print("dump...")
    json.dump(dict_author_keywords_freq, open(to_file, "w"), encoding="utf-8")
def extract_instances(self, train_instances):
    """ extract features from train instances """
    # first extract information from the train instances;
    # this is only used to extract data_set information and can reuse the pyprind bar
    self.extract_information(train_instances)

    features = []
    infos = []
    process_bar = pyprind.ProgPercent(len(train_instances))
    for train_instance in train_instances:
        process_bar.update()
        feature, info = self.extract(train_instance)  # variadic arguments are passed through
        features.append(feature)
        infos.append(info)
    return features, infos
def load_parse_data(train_file, nlp=None, flag=False):
    """
    Load data after Parse, like POS, NER, etc.
    Value: [ SentPair:class, ... ]
    Parameter:
        flag: False (default), load from file (resources....)
              True, parse and write to file, and then load from file
    """
    ''' Pre-Define Write File '''
    # parse_train_file = config.PARSE_DIR + '/' + \
    #     utils.FileManager.get_file(train_file)
    parse_train_file = train_file.replace('./data', './generate/parse')

    if flag or not os.path.isfile(parse_train_file):
        print(train_file)

        ''' Parse Data '''
        data = load_data(train_file)
        print('*' * 50)
        print("Parse Data, train_file=%s, n_train=%d\n" % (train_file, len(data)))

        parse_data = []
        process_bar = pyprind.ProgPercent(len(data))
        for (sent, label) in data:
            process_bar.update()
            sent = preprocess(sent)
            parse_data.append((sent, label))

        ''' Write Data to File '''
        with utils.create_write_file(parse_train_file) as f_parse:
            for parse_instance in parse_data:
                line = json.dumps(parse_instance, ensure_ascii=False)
                print(line, file=f_parse)

    ''' Load Data from File '''
    print('*' * 50)
    parse_data = []
    with utils.create_read_file(parse_train_file) as f:
        for line in f:
            sent, label = json.loads(line)
            sentpair_instance = Sent(sent, label)
            parse_data.append(sentpair_instance)
    print("Load Data, train_file=%s, n_train=%d\n" % (train_file, len(parse_data)))
    return parse_data
def train(model, train_data_loader, optimizer_model, optimizer_step,
          criterion, use_cuda, epoch):
    totalloss_meter = AverageMeter()
    psnr_meter = AverageMeter()

    # switch to train mode
    model.train()

    # progress bar init
    bar = pyprind.ProgPercent(len(train_data_loader), update_interval=1)

    tic = time.time()
    for batch, input_image in enumerate(train_data_loader):
        input_image = input_image.cuda()

        # Compute gradient and do optimizer step
        optimizer_model.zero_grad()

        # Compute output
        recon_image, encode_master, encode_student = model(input_image)

        # Calculate loss: weighted reconstruction loss plus master/student encoding loss
        total_loss = 0.01 * criterion(input_image, recon_image) + \
            criterion(encode_master, encode_student)
        total_loss.backward()
        optimizer_model.step()
        if optimizer_step is not None:
            optimizer_step.step()

        toc = time.time() - tic
        tic = time.time()

        # Metrics
        psnr_ = psnr_metric(input_image, recon_image)
        totalloss_meter.update(total_loss.data.cpu().item(), input_image.shape[0])
        psnr_meter.update(psnr_.data.cpu().item(), input_image.shape[0])

        # Update log progress bar
        log_ = ' loss:' + '{0:4.4f}'.format(totalloss_meter.avg)
        log_ += ' psnr:' + '{0:4.4f}'.format(psnr_meter.avg)
        log_ += ' batch time:' + '{0:2.3f}'.format(toc)
        bar.update(item_id=log_)

        del total_loss, input_image, recon_image, encode_master, encode_student

    return totalloss_meter.avg, psnr_meter.avg
def main(topology):
    inactive_users = read_json('dnld_tweets/inactive_users.json')
    active_users = read_json('dnld_tweets/active_users.json')
    _, app_auths = auth.get_access_creds()
    tweets_dir = './dnld_tweets/'

    with open(topology, 'r') as inp_file:
        comm_set = set(user for community in inp_file
                       for user in ast.literal_eval(community))

    if not os.path.exists(os.path.dirname(tweets_dir)):
        os.makedirs(os.path.dirname(tweets_dir), 0o755)

    bar = pyprind.ProgPercent(len(comm_set), track_time=True, title='Downloading Tweets')
    while comm_set:
        user = comm_set.pop()
        bar.update(item_id=user)

        if str(user) in inactive_users:
            continue

        api = auth.manage_auth_handlers(app_auths)

        # skip user if they don't exist or are inactive
        status_count = user_status_count(user, api)
        if status_count <= 10:
            inactive_users[str(user)] = status_count
            write_json(tweets_dir, active_users, inactive_users)
            continue

        # skip user if you've already downloaded their tweets
        if os.path.exists(os.path.join(tweets_dir, str(user))):
            active_users[str(user)] = status_count
            write_json(tweets_dir, active_users, inactive_users)
            continue

        tweets = get_tweets(user, api)
        if tweets:
            tweet_filename = tweets_dir + str(user)
            write_tweets(tweets, tweet_filename)
            active_users[str(user)] = status_count
        else:
            inactive_users[str(user)] = 0

        write_json(tweets_dir, active_users, inactive_users)
def get_user_followers(twpy_api, user_ids):
    # returns the followers of each user {user: [followers]} and the combined set
    # of user ids (the input users plus their followers)
    followers = list(user_ids)  # copy so the input list is not extended while iterating over it
    user_followers = {}
    bar = pyprind.ProgPercent(len(user_ids), track_time=True, title='Finding user followers')
    for user in user_ids:
        bar.update(item_id=str(user) + '\t')
        try:
            # protected tweets or user doesn't exist
            user_followers[user] = twpy_api.followers_ids(id=user)
            followers.extend(user_followers[user])
        except:
            print("Skipping user: " + str(user))
    return set(followers), user_followers
def get_top_k_coauthors(paper_author_path, k, to_file):
    data = util.read_dict_from_csv(paper_author_path)
    dict_paperId_to_authors = {}
    bar = pyprind.ProgPercent(len(data))
    for item in data:
        paperId = int(item["PaperId"])
        authorId = int(item["AuthorId"])
        if paperId not in dict_paperId_to_authors:
            dict_paperId_to_authors[paperId] = []
        dict_paperId_to_authors[paperId].append(authorId)
        bar.update()
    print("dump...")
    json.dump(dict_paperId_to_authors, open(to_file, "w"), encoding="utf-8")
def make_frames(z):
    files = []
    tmpdir = tempfile.mkdtemp()

    if verbose:
        print('Saving sequence ' + filename + ' as a ' + vext + ' format')
        pbar = progressbar.ProgPercent(N_frame, monitor=True)

    for frame in range(N_frame):
        if verbose:
            pbar.update()
        fname = 'frame%06d.png' % frame
        full_fname = os.path.join(tmpdir, fname)
        image = np.rot90(z[..., frame])
        imageio.imsave(full_fname, (image * 255).astype(np.uint8),
                       compression=0, quantize=256)
        files.append(fname)

    if verbose:
        print(pbar)
    return tmpdir, files
def aligning_documents_by_interlanguage_links(source_corpus_file,
                                              target_corpus_file,
                                              source_language,
                                              target_language,
                                              output_path):
    if not output_path.endswith('/'):
        output_path = output_path + '/'
    check_dir(output_path)  # if the directory does not exist, then create it

    logging.info(
        'aligning %s and %s wikipedia documents using interlanguage links',
        source_language, target_language)

    source_docs = split_wikipedia_docs_into_array(source_corpus_file)
    logging.info('source corpus is loaded')
    target_docs = split_wikipedia_docs_into_array(target_corpus_file)
    logging.info('target corpus is loaded')

    target_titles = [
        get_title_from_interlanguage_links(d, source_language)
        for d in target_docs
    ]

    logging.info('start aligning...')
    source_out = open(output_path + source_language + '-wiki.txt', 'w')
    target_out = open(output_path + target_language + '-wiki.txt', 'w')
    count = 1
    my_prperc = pyprind.ProgPercent(len(source_docs))
    for i in range(len(source_docs)):
        my_prperc.update()  # print progress
        source_title = get_title_from_interlanguage_links(
            source_docs[i], source_language)
        try:
            index = target_titles.index(source_title)
            text_out = source_docs[i]
            print(text_out.encode('utf-8'), file=source_out)
            text_out = target_docs[index]
            print(text_out.encode('utf-8'), file=target_out)
            count += 1
        except:
            continue

    logging.info(
        'aligning by document interlanguage links is done! ... \n %d documents are aligned',
        count)
def img_to_jpeg(img_path):
    """
    Transform every image in path into a png one for making it compatible with TF pre-trained NN
    :param img_path: path to image folder
    :return: None. It writes images on disk
    """
    img_found_counter = 0
    img_converted_counter = 0

    if not os.path.isdir(img_path):
        raise InputError('{} is not a valid path'.format(img_path))

    for (gen_path, bmp_paths, img_names) in os.walk(img_path):
        bar = pyprind.ProgPercent(len(img_names))
        for file_name in img_names:
            if not file_name.endswith(IMAGES_EXTENSION):
                file_no_extension = os.path.splitext(file_name)[0]
                img_found_counter += 1

                # if (file_no_extension + IMAGES_EXTENSION) not in img_names:
                if True:
                    logger.info('Now processing: {}'.format(file_name))
                    file_path = os.path.join(gen_path, file_name)
                    if not os.path.isfile(file_path):
                        raise InputError(
                            '{} is not a valid image'.format(file_path))

                    with Image.open(file_path) as img:
                        img = img.convert('L')
                        img = uglify_image(img)
                        # path is valid as it has been checked before
                        img.save(os.path.join(
                            gen_path, file_no_extension + IMAGES_EXTENSION),
                            IMAGES_EXTENSION.replace('.', ''),
                            dpi=(DPI_EXTRACTION, DPI_EXTRACTION))
                        img_converted_counter += 1
                        logger.info('{} successfully written on disk!'.format(
                            file_name))
            bar.update()

    if img_found_counter == 0:
        logger.warning('No img to convert found!')
    elif img_converted_counter == 0:
        logger.info('No img to convert left!')
def get_data(server, dataUrls, category, dataset, dbconn):
    data = []
    category_keys = [str(a) for a in category.keys()]
    percentage = pyprind.ProgPercent(len(dataUrls))  # progress bar
    for dataUrl in dataUrls:
        response = server.get(dataUrl).json()
        if 'dataValues' in response.keys():
            values = response['dataValues']
            for value in values:
                categoryOptionCombo = value['categoryOptionCombo'].lower()
                if 'value' in value.keys() and categoryOptionCombo in category_keys:
                    data.append([
                        value['orgUnit'], value['period'], value['dataElement'],
                        category[categoryOptionCombo], value['value']
                    ])
        # update progress bar
        percentage.update()
    return data
def prepare_author_keywords(aid_pid_path, paper_path):
    aid_pid = util.read_dict_from_csv(aid_pid_path)
    paper = util.read_dict_from_csv(paper_path)
    print('finish loading csv file')

    max_aid, max_pid = 0, 0
    for item in aid_pid:
        max_aid = max(max_aid, int(item["AuthorId"]))
        max_pid = max(max_pid, int(item['PaperId']))
    for item in paper:
        max_pid = max(max_pid, int(item["Id"]))
    print('max_aid', max_aid, 'max_pid', max_pid)

    paper_indexd_list = list(range(max_pid + 1))
    for item in paper:
        paper_indexd_list[int(item["Id"])] = item
    del paper

    # one independent list per author id (avoid [[]] * n, which aliases a single shared list)
    keywords = [[] for _ in range(max_aid + 1)]
    bar = pyprind.ProgPercent(len(aid_pid))
    for item in aid_pid:
        bar.update()
        aid = int(item['AuthorId'])
        pid = int(item['PaperId'])
        # the slot holds a paper dict (not the original integer) only if the paper exists
        if pid != paper_indexd_list[pid]:
            kw = paper_indexd_list[pid]["Keyword"]
            if kw:
                keywords[aid].append(kw)

    author_keywords = []
    for index, item in enumerate(keywords):
        dic = {}
        if item:
            dic["AuthorId"] = str(index)
            dic["Keywords"] = " ".join(item)
            author_keywords.append(dic)
    del keywords

    write_dict_to_csv(['AuthorId', 'Keywords'], author_keywords,
                      config.AUTHOR_KEYWORDS_FILE)
    print('finish writing author_keywords csv in', config.AUTHOR_KEYWORDS_FILE)
def evalModels(models, test_loader, testing_mode=False, return_y=False):
    test_correct = {key: 0.0 for key in models}
    if return_y:
        y_pred = {key: torch.Tensor([]).long() for key in models}
        y_true = torch.Tensor([]).long()

    bar = pyprind.ProgPercent(len(test_loader.dataset), title="Testing epoch : ")

    for model in models.values():
        model.train(testing_mode)

    with torch.no_grad():
        for idx, data in enumerate(test_loader):
            x, y = data
            inputs = x.to(device)
            labels = y.to(device)
            if return_y:
                y_true = torch.cat((y_true, y.long().view(-1)))

            for key, model in models.items():
                outputs = model(inputs)
                test_correct[key] += (torch.max(outputs, 1)[1] ==
                                      labels.long().view(-1)).sum().item()
                if return_y:
                    y_pred[key] = torch.cat(
                        (y_pred[key],
                         torch.max(outputs, 1)[1].to(torch.device('cpu')).long()))

            bar.update(test_loader.batch_size)

    if return_y:
        return test_correct, y_true, y_pred
    else:
        return test_correct
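# Hypothetical usage sketch for evalModels above (the model instances, test
# loader, and device are assumptions, not part of the original snippet):
# compare two trained models on the same test set and report per-model accuracy.
models = {'EEGNet': eegnet_model, 'DeepConvNet': deepconv_model}
test_correct, y_true, y_pred = evalModels(models, test_loader, return_y=True)
for name, correct in test_correct.items():
    print('{}: {:.2f}%'.format(name, 100.0 * correct / len(test_loader.dataset)))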
def get_1w_positive_5w_negative(in_file, to_file):
    fout = open(to_file, "w")
    count = {}
    process_bar = pyprind.ProgPercent(3500000)
    for line in open(in_file):
        process_bar.update()
        wanted = False
        label_info = json.loads(line.split("\t")[-1])
        for P in label_info:
            if P not in count:
                count[P] = {}
                count[P]["positive"] = 0
                count[P]["negative"] = 0
                count[P]["NULL"] = 0
            for so in label_info[P]["candidates"]:
                label = label_info[P]["candidates"][so]["label"]
                if label > 0 and count[P]["positive"] < 10000:
                    wanted = True
                    count[P]["positive"] += 1
                if label < 0 and count[P]["negative"] < 50000:
                    wanted = True
                    count[P]["negative"] += 1
                if label == 0:
                    count[P]["NULL"] += 1
        if wanted:
            fout.write(line)
    fout.close()
def get_dict_auther_keywords(paper_path, paper_author_path, k, to_file):
    data_paper = util.read_dict_from_csv(paper_path)
    dict_paper_author = json.load(open(paper_author_path), encoding="utf-8")
    dict_auther_keywords = {}
    print("start...")
    bar = pyprind.ProgPercent(len(data_paper))
    for item in data_paper:
        paperId = int(item["Id"])
        title = item["Title"]
        keywords = item["Keyword"]
        key = util.get_string_splited(title + " " + keywords)
        for authorId in dict_paper_author[str(paperId)]:
            if authorId not in dict_auther_keywords:
                dict_auther_keywords[authorId] = []
            dict_auther_keywords[authorId].extend(key)
        bar.update()
    print("dump...")
    json.dump(dict_auther_keywords, open(to_file, "w"), encoding="utf-8")
def export(self, filename=None, **kwargs):
    if not filename:
        raise Exception('The "filename" parameter is required to export.')
    filename = os.path.expanduser(filename)

    from solvebio import Dataset

    result_count = len(self.query)
    if result_count <= 0:
        raise AttributeError('No results found in query!')

    self.rows = []
    self.key_map = OrderedDict()
    self.key_types = {}
    for f in Dataset.retrieve(self.query._dataset_id).fields(limit=1000):
        name = f['name']
        splits = [int(s) if s.isdigit() else s for s in name.split('.')]
        self.key_map[name] = splits
        self.key_types[name] = f['data_type']

    title = 'Exporting query to: {0}'.format(filename)
    if self.show_progress:
        progress_bar = pyprind.ProgPercent(result_count, title=title, track_time=False)
    else:
        print(title)

    for ind, record in enumerate(self.query):
        row = self.process_record(record)
        self.rows.append(row)
        if self.show_progress:
            progress_bar.update()

    self.write(filename=filename)
    print('Export complete!')
def encode_word_dictionary(input_files, output_file):
    word_file_name, alphabet_file_name = input_files

    with open(alphabet_file_name) as alphabet_file:
        alphabet = json.loads(alphabet_file.read())
    with open(word_file_name) as word_file:
        words = [word.rstrip() for word in word_file]

    progress_bar = pyprind.ProgPercent(len(words))
    encoded_words = []
    for word in words:
        encoded_word = [alphabet[c] for c in word]
        encoded_word = [alphabet['START']] + encoded_word + [alphabet['END']]
        encoded_words.append(encoded_word)
        progress_bar.update()

    with open(output_file, 'w') as f:
        f.write(json.dumps(encoded_words))
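# Hypothetical usage sketch for encode_word_dictionary above (the file names
# and the alphabet layout are assumptions): the alphabet JSON is expected to
# map each character plus the START/END markers to integer ids, e.g.
# {"START": 0, "END": 1, "a": 2, "b": 3, ...}; every word in words.txt then
# becomes a list of ids wrapped in the START/END codes.
encode_word_dictionary(('words.txt', 'alphabet.json'), 'encoded_words.json')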