def test_knn():
    dataset = pickle.load(open("dataset.obj", "rb"))
    n_classes = len(dataset.get_classes())
    start = time.time()
    predictions = knn.knn(dataset)
    end = time.time()
    elapsed_time = utils.humanize_time(end - start)
    print("Elapsed time using knn {0}...".format(elapsed_time))
    print("predictions = \n{0}".format(predictions))
    utils.write_list(predictions, "results/knn-predictions.txt")
    # predictions = [
    #     [1, 1, 0, 2, 4, 3, 2, 0, 2, 4, 0, 3, 2, 1, 1],
    #     [1, 2, 4, 2, 1, 0, 4, 1, 3, 2, 2, 2, 1, 2, 1],
    #     [2, 3, 4, 2, 2, 0, 2, 0, 3, 3, 1, 2, 2, 2, 3],
    #     [0, 1, 3, 3, 3, 3, 1, 3, 3, 3, 2, 2, 3, 0, 1],
    #     [3, 0, 2, 1, 4, 2, 1, 0, 2, 4, 1, 1, 4, 2, 3]
    # ]
    hist = np.zeros((n_classes, n_classes), dtype=np.uint16)
    for i in range(len(predictions)):
        for j in range(len(predictions[i])):
            c = predictions[i][j]
            hist[i][c] += 1
    print("hist = \n{0}".format(hist))
    np.savetxt("results/knn-hist.csv", hist, fmt="%i", delimiter=",")
    confusion_matrix = hist / 25.0
    print("conf mat = \n{0}".format(confusion_matrix))
    values = [confusion_matrix[i][i] for i in range(n_classes)]
    precision = np.average(values)
    print("precision = {0}".format(precision))
    plt.matshow(confusion_matrix)
    plt.title('Confusion matrix')
    plt.colorbar()
    plt.show()

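# The snippets in this collection lean on small list I/O helpers that are
# not shown.  Below is a hypothetical, minimal sketch of what write_list /
# read_list are assumed to do (one item per line); the real utils
# implementations may differ, and note that some snippets call
# write_list(items, path) while others call write_list(path, items).
def write_list(items, file_path):
    """Write one item per line."""
    with open(file_path, "w") as f:
        for item in items:
            f.write("{}\n".format(item))


def read_list(file_path):
    """Read a file into a list of stripped lines; empty list if missing."""
    try:
        with open(file_path) as f:
            return [line.strip() for line in f]
    except IOError:
        return []
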
def _make(token):
    keys = read_list(
        os.path.join(data_root, 'clean_{}_key_list.txt'.format(token)))
    lines = [k + ' ' + label_kv[k] for k in keys]
    np.random.shuffle(lines)
    output_file = os.path.join(output_dir, 'clean_{}.txt'.format(token))
    write_list(lines, output_file)

def process(user_id, all=False):
    print('process user_id=%s, all=%s' % (user_id, all))
    medias = get_medias(user_id, all)
    for m in medias:
        d = to_dict(m)
        pprint.pprint(d)
    if all:
        print('medias total: %s' % len(medias))
        urls = [m.get_standard_resolution_url() for m in medias]
        write_list(u'%s_list.txt' % user_id, urls)
    download_insta_files(get_medias(user_id, all), output=user_id)

def make_data(data, labels, output_dir, prefix):
    image_dir = prefix + '_images/'
    mkdir_if_missing(osp.join(output_dir, image_dir))
    file_label_list = []
    num = len(data)
    for i in xrange(num):
        img = np.rollaxis(data[i, :].reshape((3, 32, 32)), 0, 3)
        filename = '{:05d}.jpg'.format(i)
        imsave(osp.join(output_dir, image_dir, filename), img)
        file_label_list.append('{} {}'.format(
            osp.join(image_dir, filename), int(labels[i])))
    write_list(file_label_list, osp.join(output_dir, prefix + '.txt'))

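# Hypothetical sketch of the mkdir_if_missing helper used above, inferred
# from its name and usage only; the real helper may differ.
import os


def mkdir_if_missing(path):
    # create the directory (and parents) if it does not already exist
    if not os.path.exists(path):
        os.makedirs(path)
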
def store_labels(self, train_labels, test_labels):
    ## Store the labels in a text file
    print("Storing the labels in text files...")
    start = time.time()
    train_labels_fn = "results/train_labels_{0}.txt".format(self.n_classes)
    test_labels_fn = "results/test_labels_{0}.txt".format(self.n_classes)
    utils.write_list(train_labels, train_labels_fn)
    utils.write_list(test_labels, test_labels_fn)
    end = time.time()
    s = "Elapsed time storing the labels {0} secs.".format(end - start)
    self.log += s + "\n"
    print(s)

def make_aux_mixed(data_root, output_dir, upsample_ratio=1.0):
    ntype_kv = dict(zip(*read_kv(os.path.join(output_dir, "ntype_train.txt"))))
    clean_kv = dict(zip(*read_kv(os.path.join(data_root, "clean_label_kv.txt"))))
    noisy_kv = dict(zip(*read_kv(os.path.join(data_root, "noisy_label_kv.txt"))))
    clean_keys = read_list(os.path.join(data_root, "clean_train_key_list.txt"))
    noisy_keys = read_list(os.path.join(data_root, "noisy_train_key_list.txt"))
    # upsampling clean keys to ratio * #noisy_keys
    # (cast to int: np.random.choice expects an integer sample size)
    clean_keys = np.random.choice(clean_keys,
                                  int(len(noisy_keys) * upsample_ratio))
    # mix clean and noisy data
    keys = list(clean_keys) + list(noisy_keys)
    np.random.shuffle(keys)
    clean, noisy, ntype = [], [], []
    for k in keys:
        if k in clean_kv:
            clean.append(clean_kv[k])
            noisy.append("-1")
        else:
            clean.append("-1")
            noisy.append(noisy_kv[k])
        if k in ntype_kv:
            ntype.append(ntype_kv[k])
        else:
            ntype.append("-1")
    keys = [k + " -1" for k in keys]
    write_list(keys, os.path.join(output_dir, "mixed_train_images.txt"))
    write_list(clean, os.path.join(output_dir, "mixed_train_label_clean.txt"))
    write_list(noisy, os.path.join(output_dir, "mixed_train_label_noisy.txt"))
    write_list(ntype, os.path.join(output_dir, "mixed_train_label_ntype.txt"))

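# Hypothetical sketch of the key/value helpers assumed above (read_kv and
# write_kv).  Inferred from usage only: each line holds "key value"
# separated by whitespace, and read_kv returns the two columns as parallel
# lists so that dict(zip(*read_kv(path))) builds a key->value mapping.
# The real implementations may differ.
def read_kv(file_path):
    keys, values = [], []
    with open(file_path) as f:
        for line in f:
            k, v = line.strip().split(None, 1)
            keys.append(k)
            values.append(v)
    return keys, values


def write_kv(keys, values, file_path):
    with open(file_path, "w") as f:
        for k, v in zip(keys, values):
            f.write("{} {}\n".format(k, v))
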
def _make(keys, token):
    clean_labels = np.asarray([int(clean_kv[k]) for k in keys])
    noisy_labels = np.asarray([int(noisy_kv[k]) for k in keys])
    lines = []
    alpha = 1.0 / (C.shape[0] - 1)
    for key, y, y_tilde in zip(keys, clean_labels, noisy_labels):
        if y == y_tilde:
            lines.append(key + " 0")
        elif alpha >= C[y_tilde][y]:
            lines.append(key + " 1")
        else:
            lines.append(key + " 2")
    np.random.shuffle(lines)
    output_file = os.path.join(output_dir, "ntype_{}.txt".format(token))
    write_list(lines, output_file)

def main(args):
    size = args.size
    q = generate_matrix_q(args.level)
    write_matrix(
        q, osp.join(args.data_root, 'matrix_q' + repr(args.level) + '.txt'))
    pickle(q, osp.join(args.data_root, 'matrix_q' + repr(args.level) + '.pkl'))
    files, labels = parse(osp.join(args.data_root, 'train.txt'))
    noisy_labels = corrupt(labels, q)
    write_file_label_list(files[:size], labels[:size],
                          osp.join(args.data_root, 'clean.txt'))
    write_file_label_list(
        files[:size], noisy_labels[:size],
        osp.join(args.data_root, 'noisy_' + repr(args.level) + '.txt'))
    write_list(
        noisy_labels[:size],
        osp.join(args.data_root, 'labels_noisy_' + repr(args.level) + '.txt'))
    write_list([f + ' -1' for f in files[:size]],
               osp.join(args.data_root, 'images.txt'))

def main(args):
    id_offset = 0
    merged_train_kv = {}
    merged_val_kv = {}
    for dataset_dir, db_dir in zip(args.dataset_dirs, args.db_dirs):
        train_files, train_labels = read_kv(osp.join(db_dir, 'train.txt'))
        val_files, val_labels = read_kv(osp.join(db_dir, 'val.txt'))
        unique_ids = set(map(int, train_labels + val_labels))
        id_mapping = {idx: i + id_offset for i, idx in enumerate(unique_ids)}
        for k, v in zip(train_files, train_labels):
            merged_train_kv[osp.join(dataset_dir, k)] = id_mapping[int(v)]
        for k, v in zip(val_files, val_labels):
            merged_val_kv[osp.join(dataset_dir, k)] = id_mapping[int(v)]
        id_offset += len(id_mapping)
    mkdir_if_missing(osp.join(args.output_dir))
    train_list = [k + ' ' + str(v) for k, v in merged_train_kv.iteritems()]
    np.random.shuffle(train_list)
    write_list(train_list, osp.join(args.output_dir, 'train.txt'))
    write_kv(merged_val_kv.keys(), map(str, merged_val_kv.values()),
             osp.join(args.output_dir, 'val.txt'))
    print "Max ID:", id_offset

def metrics(self, rankings, train_labels, test_labels, sorted_prods):
    ### Calculates classification and products set and position mAP     ###
    ###------------------------------------------------------------------###
    print("Starting to calculate metrics ...")
    start = time.time()
    rel_ranks = []
    for i in range(len(rankings)):
        rel_ranks.append(
            utils.relevance_ranking(rankings[i], train_labels, test_labels[i]))

    # Classification mAP
    #-----------------------------------------------------------------------
    class_ap = [utils.class_ap(rel_rk) for rel_rk in rel_ranks]
    class_ap_filename = "results/class_avg_precs_{0}.txt".format(
        self.n_classes)
    utils.write_list(class_ap, class_ap_filename)
    class_map = np.mean(class_ap)
    self.log += "ranking size = {0}".format(len(rankings[0])) + "\n"
    s = "classification mean average precision = {0}".format(class_map)
    self.log += s + "\n"
    print(s)

    # Dot products average precision
    #-----------------------------------------------------------------------
    # Set
    set_prec = []
    for i in range(len(rankings)):
        indices = [prods[0] for prods in sorted_prods[i]]
        precision = utils.prod_set_prec(indices, rankings[i])
        set_prec.append(precision)
    set_ap_filename = "results/set_avg_precs_{0}.txt".format(self.n_classes)
    utils.write_list(set_prec, set_ap_filename)
    set_map = np.mean(set_prec)
    s = "set mean average precision = {0}".format(set_map)
    self.log += s + "\n"
    print(s)

    # Position
    pos_prec = []
    for i in range(len(rankings)):
        indices = [prods[0] for prods in sorted_prods[i]]
        precision = utils.prod_pos_prec(indices, rankings[i])
        pos_prec.append(precision)
    pos_ap_filename = "results/pos_avg_precs_{0}.txt".format(self.n_classes)
    utils.write_list(pos_prec, pos_ap_filename)
    pos_map = np.mean(pos_prec)
    s = "position mean average precision = {0}".format(pos_map)
    self.log += s + "\n"
    print(s)

    end = time.time()
    elapsed_time = utils.humanize_time(end - start)
    s = "Elapsed time calculating metrics: {0}".format(elapsed_time)
    self.log += s + "\n"
    print(s)

def upload_photos_to_album(album_id, photos):
    print('Upload photos to album %s' % album_id)
    if os.path.isfile(photos):
        files = [os.path.basename(photos)]
        output = os.path.dirname(photos)
    else:
        files = os.listdir(photos)
        output = photos
    done_file = os.path.join(output, '%s_done.txt' % album_id)
    finished = read_list(done_file)
    error_count = 0
    for f in files:
        image = os.path.join(output, f)
        _, ext = os.path.splitext(f)
        if not ext or ext.lower() not in ['.jpg', '.png', '.gif']:
            # print('Invalid %s' % image)
            continue
        try:
            if f not in finished:
                print('Uploading %s' % image)
                api.photo_upload(album_id, image, f)
                finished.append(f)
                write_list(done_file, finished)
                time.sleep(random.randint(1, 3))
            else:
                print('Skip %s' % image)
        except KeyboardInterrupt, e:
            print("User interrupt, quit.")
            raise
        except Exception, e:
            print("Error:%s On uploading :%s" % (e, image))
            traceback.print_exc()
            error_count += 1
            if error_count > 5:
                break
            time.sleep(error_count * 10)

def make_data(files, noise_types, data_root):
    # noise types training and val
    merged = zip(files, noise_types)
    np.random.shuffle(merged)
    training = ['{} {}'.format(f, t) for f, t in merged[:8000]]
    test = ['{} {}'.format(f, t) for f, t in merged[8000:]]
    write_list(training, osp.join(data_root, 'ntype_train.txt'))
    write_list(test, osp.join(data_root, 'ntype_test.txt'))
    # noise types of mixed training images
    dic = defaultdict(lambda: -1)
    dic.update(dict(zip(files, noise_types)))
    files = read_list(osp.join(data_root, 'mixed_train_images.txt'))
    files = [f.split()[0] for f in files]
    noise_types = [dic[f] for f in files]
    write_list(noise_types, osp.join(data_root, 'mixed_train_label_ntype.txt'))

def main(args):
    q = generate_matrix_q(args.level)
    write_matrix(q, osp.join(args.data_root, 'matrix_q.txt'))
    pickle(q, osp.join(args.data_root, 'matrix_q.pkl'))
    files, labels = parse(osp.join(args.data_root, 'train.txt'))
    noisy_labels = corrupt(labels, q)
    write_file_label_list(files[:10000], labels[:10000],
                          osp.join(args.data_root, 'clean_train.txt'))
    write_file_label_list(files[:10000], noisy_labels[:10000],
                          osp.join(args.data_root, 'noisy_train.txt'))
    noisy_as_clean_labels = labels[:10000] + noisy_labels[10000:]
    noisy_as_none_labels = labels[:10000] + [-1] * 40000
    clean_as_none_labels = [-1] * 10000 + noisy_labels[10000:]
    merged = zip(files, noisy_as_clean_labels, noisy_as_none_labels,
                 clean_as_none_labels)
    np.random.shuffle(merged)
    files, nacl, nanl, canl = zip(*merged)
    write_file_label_list(files, nacl,
                          osp.join(args.data_root, 'mixed_train.txt'))
    write_list([f + ' -1' for f in files],
               osp.join(args.data_root, 'mixed_train_images.txt'))
    write_list(nanl, osp.join(args.data_root, 'mixed_train_label_clean.txt'))
    write_list(canl, osp.join(args.data_root, 'mixed_train_label_noisy.txt'))

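# Hypothetical sketch of the corrupt(labels, q) step used in the two mains
# above, under the assumption that q is a label-noise transition matrix
# whose row q[y] gives the probability of flipping true class y to each
# class.  The real generate_matrix_q / corrupt implementations are not
# shown here and may differ.
import numpy as np


def corrupt(labels, q):
    # sample a (possibly) noisy label for each clean label y from row q[y]
    n_classes = q.shape[0]
    return [int(np.random.choice(n_classes, p=q[y])) for y in labels]
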
import os
import sys
import utils

# collect bold and header terms
sites = ['gamepedia', 'LeagueFandom', 'mobafire']
for site in sites:
    terms = list()
    folder = os.path.join("data", site + "Data")
    files = os.listdir(folder)
    boldFiles = [b for b in files if "boldTerms" in b]
    headerFiles = [h for h in files if "headerTerms" in h]
    fileLists = [boldFiles, headerFiles]
    termTypeNames = ["bold", "header"]
    for j, filelist in enumerate(fileLists):
        for idx, filename in enumerate(filelist):
            #clear_output(wait=True)
            print("processing", site, ":", termTypeNames[j],
                  ": {} of {}".format(idx, len(filelist) - 1))
            try:
                # load_list is assumed to be imported elsewhere (e.g. from utils)
                terms = terms + load_list(os.path.join(folder, filename))
            except:
                e = sys.exc_info()
                print(e)
                print(os.path.join(folder, filename))
        utils.write_list(
            terms,
            os.path.join("data", site + "_" + termTypeNames[j] + "Terms.txt"))
print("done!")

        suffix=file + " " * (80 - len(file)), length=50)
    text = ""
    with open(file, 'r', encoding='utf-8', errors='ignore') as f:
        text = f.read()
    # remove the first line, which is the source url and add it to urls list
    url = re.match(".*\n", text)[0]
    urls.append(url.strip())
    # remove 1 endline character
    text = re.sub(".*\n", "", text, count=1)
    text = text.lower()
    # remove skin titles
    for s in removeText:
        text = text.replace(s.lower(), '')
    # add a space character to front and back so that full word search
    # works for first and last
    text = " " + text + " "
    for jdx, t in enumerate(terms):
        # only count full word matches
        count = text.count(f" {t} ")
        counts[idx][jdx] = count
    # write the counts
    with open(countFilename, 'a+', encoding='utf-8', errors='ignore') as f:
        f.write(urls[idx])
        f.write('\t')
        for c in counts[idx]:
            f.write(str(int(c)) + "\t")
        f.write("\n")

write_list(urls, "data/urls.txt")
print("done!")

        for t in termsToReplace:
            w = w.replace(t[0], t[1])
        w = re.sub('\(.*\)', '', w)
        terms.add(w.lower().strip())
fandomTermsList = list(terms)
fandomTermsList.sort()

# Gamepedia
file = "data/gamepedia_CompletedPagesList.txt"
pages = utils.load_list(file)
terms = set()
termsToReplace = [('_', ' '), ('%27', '\''), ('(item)', ''), ('%26', '&')]
for p in pages:
    if "Teamfight_Tactics" in p:
        continue
    words = p.split('/')
    for w in words:
        for t in termsToReplace:
            w = w.replace(t[0], t[1])
        w = re.sub('\(.*\)', '', w)
        terms.add(w.lower().strip())
gamepediaTermsList = list(terms)
gamepediaTermsList.sort()

terms = set.union(set(mobafireTermsList), set(fandomTermsList),
                  set(gamepediaTermsList))
termsList = list(terms)
termsList.sort()
utils.write_list(termsList, "data/termsList.txt")

    np.savetxt(train_file, imgs_train, delimiter=' ', fmt='%s')
    np.savetxt(test_file, np.concatenate(imgs_test_l, axis=0),
               delimiter=' ', fmt='%s')


if __name__ == '__main__':
    # prob = [1.35, 0.7, 1.35]
    prob = [1.5, 0., 1.5]

    # train_file = HOME + '/prj/few-shot/data/imglst/img10k.train.txt'
    # test_file = HOME + '/prj/few-shot/data/imglst/img10k.test.txt'
    # prefix = HOME + '/prj/few-shot/data/imagenet-raw'

    train_file = HOME + '/prj/few-shot/data/imglst/img10k.train.no1k.txt'
    test_file = HOME + '/prj/few-shot/data/imglst/img10k.test.no1k.txt'
    prefix = HOME + '/prj/few-shot/data/imagenet-raw'

    # train_file = HOME + '/prj/few-shot/data/imglst/img10k.train.disk.txt'
    # test_file = HOME + '/prj/few-shot/data/imglst/img10k.test.disk.txt'
    # prefix = HOME + '/prj/few-shot/data/imagenet-raw'

    # train_file2 = HOME + '/prj/few-shot/data/imglst/img10k.train.redis.txt'
    # test_file2 = HOME + '/prj/few-shot/data/imglst/img10k.test.redis.txt'
    # prefix2 = '/mnt/nfs1703/kchen/imagenet-raw-trans-to-redis'

    names, nimgs = cls_sample(num, prob)
    # utils.write_list('/home/wangxinglu/prj/few-shot/data/imagenet10k.txt.chk',
    #                  names, delimiter=' ', fmt='%s')
    utils.write_list('/home/wangxinglu/prj/few-shot/data/imagenet10k.no1k.chk',
                     names, delimiter=' ', fmt='%s')
    gen_imglst(names, prefix, train_file, test_file)
    # gen_imglst(names, prefix2, train_file2, test_file2)

def write_file_label_list(files, labels, file_path):
    content = ['{} {}'.format(f, l) for f, l in zip(files, labels)]
    write_list(content, file_path)

def _save(file_label_list, file_path):
    content = ['{} {}'.format(x, y) for x, y in file_label_list]
    write_list(content, file_path)

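# Usage sketch for the two small helpers above (illustrative only; the
# file names here are made up).  Both end up writing "path label" lines.
write_file_label_list(['a.jpg', 'b.jpg'], [0, 1], 'train.txt')
# train.txt:
#   a.jpg 0
#   b.jpg 1
_save([('a.jpg', 0), ('b.jpg', 1)], 'train_pairs.txt')  # same line format
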
from config import API_KEY, API_SECRET, USERNAME, PASSWORD

if __name__ == '__main__':
    print(sys.argv)
    if len(sys.argv) < 3:
        print('Usage: python %s album_id dir' % sys.argv[0])
        exit(1)
    api = ApiClient(key=API_KEY, secret=API_SECRET)
    print(api.login(USERNAME, PASSWORD))
    album = sys.argv[1]
    directory = sys.argv[2]
    files = os.listdir(directory)
    finished = read_list('%s.txt' % album)
    error_count = 0
    for f in files:
        image = os.path.join(directory, f)
        try:
            if f not in finished:
                print('Uploading %s' % image)
                api.photo_upload(album, image, f)
                finished.append(image)
                time.sleep(2)
        except Exception, e:
            print("error:%s on uploading :%s" % (e, image))
            error_count += 1
            if error_count > 5:
                break
            time.sleep(error_count * 10)
    write_list('%s.txt' % album, finished)

    page = requests.get(site)
    soup = BeautifulSoup(page.content, 'lxml')
    text = re.split("\s+", soup.get_text())
    for t in text:
        if "http" in t:
            if t not in visitedPages:
                if ".xml" in t:
                    sitemap.append(t)
                else:
                    pageList.append(t)

remove = [
    'build', 'teamfight-tactics', 'stream', 'blog', 'video', 'tier-list',
    '/wiki/mobafire', 'forum', 'news', 'toplist', 'tournaments'
]
refined = [
    p.replace("https://www.mobafire.com/league-of-legends/", "")
    for p in pageList if "league-of-legends" in p
]
for r in remove:
    refined = [item for item in refined if r not in item]
builds = [u + "/stats" for u in refined if "champion/" in u]
for b in builds:
    refined.append(b)
refined.sort()
utils.write_list(refined, "data/mobafire.txt")

    resp = requests.get(baseurl + p)
    strainer = SoupStrainer(class_=["tabheader-tab", 'mw-body-content'])
    soup = BeautifulSoup(resp.content, 'lxml', parse_only=strainer)
    l = list()
    soup.find('div', id="top-schedule").decompose()
    # find all "a" tags, then grab only the ones that have a link (i.e. 'href')
    for t in soup.find_all("a"):
        if ('href' in t.attrs and "http" not in t['href']
                and t['href'][0] != "#" and "action=edit" not in t['href']):
            link = t['href']
            if link[0] == '/':
                link = link[1:]
            l.append(link)
    # add to the list of links
    for a in l:
        pageList.append(a)
    time.sleep(1)

pageList = list(set(pageList))
pageList.sort()
remove = [
    "Patch", "Portal:", "2018", "File:", "Bjergsen", "TSM", "Echo_Fox", "#",
    "Template:"
]
for r in remove:
    pageList = [p for p in pageList if r not in p]
utils.write_list(pageList, "data/gamepedia.txt")
print("finished")

def predict(input_dir, output_dir, embeddings, vocabulary, scores=False):
    dirname = os.path.dirname(os.path.realpath(__file__))
    print(dirname)
    blocks_path = os.path.join(input_dir, "blocks.pkl")
    records_path = os.path.join(input_dir, "records.tf")
    write_pred_score = (scores is True)

    print("Defining estimator...")
    rconf = tf.estimator.RunConfig(
        save_checkpoints_steps=config.SAVE_CHECKPOINT_STEPS,
        save_checkpoints_secs=None,
        model_dir=os.path.join(dirname, config.MODEL_HOME))
    params = {
        "padding_value": vocabulary["<PAD>"],
        "wembs": embeddings,
        "vocab": vocabulary,
        "coherence_hinge_margin": 1,
        "learning_rate": 0.0001
    }
    estimator = tf.estimator.Estimator(model_fn=model.model_fn, config=rconf,
                                       params=params)

    print("Loading serialized raw test set texts...")
    test_texts = pickle.load(open(blocks_path, "rb"))
    print("Loaded.")

    res = estimator.predict(input_fn=lambda: get_data.get_data(
        records_path, is_train=False, epochs=1))

    print("Documents to segment: " + str(len(test_texts[0])))
    flat_blocks = []
    for x in test_texts[0]:
        print(len(x[1]))
        flat_blocks.extend(x[1])
    print("Number of prediction blocks: " + str(len(flat_blocks)))

    print("Predicting with the model (this may take a while, depending on "
          "the number of documents)...")
    res_list = list(res)
    print("Predictions completed.")

    thold = 0.3 if config.MODEL_TYPE == "cats" else 0.5
    glob_cntr = 0
    docs = test_texts[0]
    agg_docs = []
    for i in range(len(docs)):
        fname = docs[i][0]
        if i % 1000 == 1:
            print(fname)
            print(str(i) + " of " + str(len(docs)) + " documents...")
        blocks = docs[i][1]
        preds_blocks = res_list[glob_cntr:glob_cntr + len(blocks)]
        glob_cntr += len(blocks)
        sent_scores = [(b[0][0], b[0][1], []) for b in blocks]
        for b_ind in range(len(blocks)):
            for relb_ind in range(len(blocks[b_ind])):
                if blocks[b_ind][relb_ind][0] == config.fake_sent:
                    break
                else:
                    sent_ind = b_ind + relb_ind
                    score = preds_blocks[b_ind][relb_ind][1]
                    sent_scores[sent_ind][2].append(score)
        agg_sent_scores = [(x[0], x[1], np.mean(x[2]),
                            (1 if np.mean(x[2]) >= thold else 0))
                           for x in sent_scores]
        agg_docs.append(agg_sent_scores)

    # printing out predictions
    docnames = [x[0] for x in docs]
    print("Storing segmented texts...")
    docscores = zip(docnames, agg_docs)
    for name, sentscores in docscores:
        print("Document: " + name)
        lines = []
        for s in sentscores:
            if s[2] >= thold:
                lines.append(config.seg_start)
            lines.append(s[0] + "\t" + str(s[2]) if write_pred_score else s[0])
        utils.write_list(
            os.path.join(output_dir, name.split("/")[-1] + ".seg"), lines)
    print("Stored.")

def fetch_all(total):
    list_file = 'jiusong_all.txt'
    page_file = 'jiusong_p_%s.txt'
    all_urls = []
    for i in range(1, total):
        try:
            urls = fetch_page(i)
            if urls:
                all_urls.extend(urls)
                write_list(page_file % i, urls)
            time.sleep(random.randint(1, 2))
        except KeyboardInterrupt, e:
            print("User interrupt, quit.")
            raise
        except Exception, e:
            print("Error:%s On fetch page %s" % (e, i))
            traceback.print_exc()
            time.sleep(10)
        finally:
            write_list(list_file, all_urls)


def main():
    fetch_all(405)


if __name__ == '__main__':
    main()

    files = os.listdir(folder)
    boldFiles = [b for b in files if "boldTerms" in b]
    headerFiles = [h for h in files if "headerTerms" in h]
    fileLists = [boldFiles, headerFiles]
    # termTypeNames defined above
    for j, filelist in enumerate(fileLists):
        for idx, filename in enumerate(filelist):
            #clear_output(wait=True)
            print("processing", site, ":", termTypeNames[j],
                  ": {} of {}".format(idx, len(filelist) - 1))
            try:
                terms = terms + load_list(os.path.join(folder, filename))
            except:
                e = sys.exc_info()
                print(e)
                print(os.path.join(folder, filename))
        write_list(
            terms,
            os.path.join("data", site + "_" + termTypeNames[j] + "Terms.txt"))
print("done!")

# load all the terms
print("loading terms from preprocessed files...")
sites = ['gamepedia', 'LeagueFandom', 'mobafire']
files = ['_CompletedPagesList.txt']
if args.bold:
    files.append('_boldTerms.txt')
if args.header:
    files.append('_headerTerms.txt')
terms = list()
# for each site, for each set of terms
# filter them according to the function above

        output = photos
    done_file = os.path.join(output, '%s_done.txt' % album_id)
    finished = read_list(done_file)
    error_count = 0
    for f in files:
        image = os.path.join(output, f)
        _, ext = os.path.splitext(f)
        if not ext or ext.lower() not in ['.jpg', '.png', '.gif']:
            # print('Invalid %s' % image)
            continue
        try:
            if f not in finished:
                print('Uploading %s' % image)
                api.photo_upload(album_id, image, f)
                finished.append(f)
                write_list(done_file, finished)
                time.sleep(random.randint(1, 3))
            else:
                print('Skip %s' % image)
        except KeyboardInterrupt, e:
            print("User interrupt, quit.")
            raise
        except Exception, e:
            print("Error:%s On uploading :%s" % (e, image))
            traceback.print_exc()
            error_count += 1
            if error_count > 5:
                break
            time.sleep(error_count * 10)
    write_list(done_file, finished)

]

# Go through remove list
removeList = list()
for r in badList:
    print(r)
    resp = requests.get(r)
    strainer = SoupStrainer(class_=['category-page__member'])
    soup = BeautifulSoup(resp.content, 'lxml', parse_only=strainer)
    l = list()
    #soup.find('div', id="top-schedule").decompose()
    for t in soup.find_all("a"):
        if ('href' in t.attrs and "http" not in t['href']
                and "#" not in t['href'] and "action=edit" not in t['href']):
            link = t['href']
            # remove '/wiki/' from beginning of link
            link = link.replace("/wiki/", "")
            l.append(link)
    for a in l:
        removeList.append(a)
    time.sleep(1)

removeList = list(set(removeList))
baseList = utils.load_list("data/LeagueFandom.txt")
# Go through every item in removeList and remove it from baseList
for w in removeList:
    baseList = [b for b in baseList if b != w]
utils.write_list(baseList, 'data/LeagueFandom.txt')
