Example #1
def test_knn():
	with open("dataset.obj", "rb") as f:
		dataset = pickle.load(f)
	n_classes = len(dataset.get_classes())
	start = time.time()
	predictions = knn.knn(dataset)
	end = time.time()
	elapsed_time = utils.humanize_time(end - start)
	print("Elapsed time using knn {0}...".format(elapsed_time))
	print("predictions = \n{0}".format(predictions))
	utils.write_list(predictions, "results/knn-predictions.txt")
	# predictions = [
	# 	[1, 1, 0, 2, 4, 3, 2, 0, 2, 4, 0, 3, 2, 1, 1],
	# 	[1, 2, 4, 2, 1, 0, 4, 1, 3, 2, 2, 2, 1, 2, 1],
	# 	[2, 3, 4, 2, 2, 0, 2, 0, 3, 3, 1, 2, 2, 2, 3],
	# 	[0, 1, 3, 3, 3, 3, 1, 3, 3, 3, 2, 2, 3, 0, 1],
	# 	[3, 0, 2, 1, 4, 2, 1, 0, 2, 4, 1, 1, 4, 2, 3]
	# ]
	hist = np.zeros((n_classes, n_classes), dtype=np.uint16)  # hist[i][c]: how often true class i was predicted as c
	for i in range(len(predictions)):
		for j in range(len(predictions[i])):
			c = predictions[i][j]
			hist[i][c] += 1
	print("hist = \n{0}".format(hist))
	np.savetxt("results/knn-hist.csv", hist, fmt="%i", delimiter=",")
	confusion_matrix = hist / 25.0  # 25.0: presumably the number of test samples per class
	print("conf mat = \n{0}".format(confusion_matrix))
	values = [confusion_matrix[i][i] for i in range(n_classes)]  # diagonal: per-class accuracy
	precision = np.average(values)
	print("precision = {0}".format(precision))

	plt.matshow(confusion_matrix)
	plt.title('Confusion matrix')
	plt.colorbar()
	plt.show()
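All of the examples on this page revolve around a small write_list helper (often paired with read_list). The helper itself is never shown here; the sketch below is only an assumption of its typical behaviour, writing one stringified item per line. Note that some of the projects below (e.g. Examples #4 and #14) use the reversed argument order, write_list(filename, items).

def write_list(items, file_path):
    # assumed behaviour: write one item per line, stringified
    with open(file_path, 'w') as f:
        for item in items:
            f.write('{}\n'.format(item))


def read_list(file_path):
    # assumed behaviour: return the stripped, non-empty lines of a file
    with open(file_path) as f:
        return [line.strip() for line in f if line.strip()]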
Example #2
def _make(token):
    keys = read_list(
        os.path.join(data_root, 'clean_{}_key_list.txt'.format(token)))
    lines = [k + ' ' + label_kv[k] for k in keys]
    np.random.shuffle(lines)
    output_file = os.path.join(output_dir, 'clean_{}.txt'.format(token))
    write_list(lines, output_file)
Example #4
def process(user_id, all=False):
    print('process user_id=%s, all=%s' % (user_id, all))
    medias = get_medias(user_id, all)
    for m in medias:
        d = to_dict(m)
        pprint.pprint(d)
    if all:
        print('medias total: %s' % len(medias))
        urls = [m.get_standard_resolution_url() for m in medias]
        write_list(u'%s_list.txt' % user_id, urls)
        download_insta_files(get_medias(user_id, all), output=user_id)
Example #6
def make_data(data, labels, output_dir, prefix):
    image_dir = prefix + '_images/'
    mkdir_if_missing(osp.join(output_dir, image_dir))
    file_label_list = []
    num = len(data)
    for i in range(num):
        img = np.rollaxis(data[i, :].reshape((3, 32, 32)), 0, 3)
        filename = '{:05d}.jpg'.format(i)
        imsave(osp.join(output_dir, image_dir, filename), img)
        file_label_list.append('{} {}'.format(osp.join(image_dir, filename),
                                              int(labels[i])))
    write_list(file_label_list, osp.join(output_dir, prefix + '.txt'))
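The snippet above assumes CIFAR-style input: each row of data is a flat 3×32×32 RGB image in channel-first order, which np.rollaxis converts to height×width×channel for imsave. A hypothetical call (variable and directory names are illustrative) might look like:

# hypothetical usage: dump a CIFAR-10 test batch to JPEGs plus a label list
make_data(test_data, test_labels, 'cifar10', 'test')
# -> cifar10/test_images/00000.jpg, ... and cifar10/test.txt with lines like
#    'test_images/00000.jpg 3'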
Example #8
def store_labels(self, train_labels, test_labels):
    # Store the labels in text files
    print("Storing the labels in text files...")
    start = time.time()
    train_labels_fn = "results/train_labels_{0}.txt".format(self.n_classes)
    test_labels_fn = "results/test_labels_{0}.txt".format(self.n_classes)
    utils.write_list(train_labels, train_labels_fn)
    utils.write_list(test_labels, test_labels_fn)
    end = time.time()
    s = "Elapsed time storing the labels {0} secs.".format(end - start)
    self.log += s + "\n"
    print(s)
Example #10
def make_aux_mixed(data_root, output_dir, upsample_ratio=1.0):
    ntype_kv = dict(zip(*read_kv(os.path.join(output_dir, 'ntype_train.txt'))))
    clean_kv = dict(
        zip(*read_kv(os.path.join(data_root, 'clean_label_kv.txt'))))
    noisy_kv = dict(
        zip(*read_kv(os.path.join(data_root, 'noisy_label_kv.txt'))))
    clean_keys = read_list(os.path.join(data_root, 'clean_train_key_list.txt'))
    noisy_keys = read_list(os.path.join(data_root, 'noisy_train_key_list.txt'))
    # upsample clean keys to ratio * #noisy_keys (np.random.choice needs an integer size)
    clean_keys = np.random.choice(clean_keys, int(len(noisy_keys) * upsample_ratio))
    # mix clean and noisy data
    keys = list(clean_keys) + list(noisy_keys)
    np.random.shuffle(keys)
    clean, noisy, ntype = [], [], []
    for k in keys:
        if k in clean_kv:
            clean.append(clean_kv[k])
            noisy.append('-1')
        else:
            clean.append('-1')
            noisy.append(noisy_kv[k])
        if k in ntype_kv:
            ntype.append(ntype_kv[k])
        else:
            ntype.append('-1')
    keys = [k + ' -1' for k in keys]
    write_list(keys, os.path.join(output_dir, 'mixed_train_images.txt'))
    write_list(clean, os.path.join(output_dir, 'mixed_train_label_clean.txt'))
    write_list(noisy, os.path.join(output_dir, 'mixed_train_label_noisy.txt'))
    write_list(ntype, os.path.join(output_dir, 'mixed_train_label_ntype.txt'))
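Note the alignment convention here: every key gets an entry in all three label lists, with '-1' standing in for "not applicable" (no clean label, no noisy label, or unknown noise type), so the four output files stay line-for-line aligned; the ' -1' appended to each image key plays the same placeholder role.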
Example #12
def _make(keys, token):
    clean_labels = np.asarray([int(clean_kv[k]) for k in keys])
    noisy_labels = np.asarray([int(noisy_kv[k]) for k in keys])
    lines = []
    # chance level for an off-diagonal entry of the confusion matrix C
    alpha = 1.0 / (C.shape[0] - 1)
    for key, y, y_tilde in zip(keys, clean_labels, noisy_labels):
        if y == y_tilde:
            lines.append(key + " 0")  # label kept intact
        elif alpha >= C[y_tilde][y]:
            lines.append(key + " 1")  # confusion no more likely than chance
        else:
            lines.append(key + " 2")  # systematic confusion
    np.random.shuffle(lines)
    output_file = os.path.join(output_dir, "ntype_{}.txt".format(token))
    write_list(lines, output_file)
Example #14
def fetch_all(total):
    list_file = 'jiusong_all.txt'
    page_file = 'jiusong_p_%s.txt'
    all_urls = []
    for i in range(1, total):
        try:
            urls = fetch_page(i)
            if urls:
                all_urls.extend(urls)
                write_list(page_file % i, urls)
            time.sleep(random.randint(1, 2))
        except KeyboardInterrupt:
            print("User interrupt, quit.")
            raise
        except Exception as e:
            print("Error:%s On fetch page %s" % (e, i))
            traceback.print_exc()
            time.sleep(10)
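Unlike utils.write_list in the earlier examples, this project's write_list takes the file name as its first argument. The full version of this function appears again in Example #33, where all_urls is also flushed to list_file in a finally block.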
Example #15
def main(args):
    size = args.size
    q = generate_matrix_q(args.level)
    write_matrix(
        q, osp.join(args.data_root, 'matrix_q' + repr(args.level) + '.txt'))
    pickle(q, osp.join(args.data_root, 'matrix_q' + repr(args.level) + '.pkl'))
    files, labels = parse(osp.join(args.data_root, 'train.txt'))
    noisy_labels = corrupt(labels, q)
    write_file_label_list(files[:size], labels[:size],
                          osp.join(args.data_root, 'clean.txt'))
    write_file_label_list(
        files[:size], noisy_labels[:size],
        osp.join(args.data_root, 'noisy_' + repr(args.level) + '.txt'))
    write_list(
        noisy_labels[:size],
        osp.join(args.data_root, 'labels_noisy_' + repr(args.level) + '.txt'))
    write_list([f + ' -1' for f in files[:size]],
               osp.join(args.data_root, 'images.txt'))
Example #16
def main(args):
    id_offset = 0
    merged_train_kv = {}
    merged_val_kv = {}
    for dataset_dir, db_dir in zip(args.dataset_dirs, args.db_dirs):
        train_files, train_labels = read_kv(osp.join(db_dir, 'train.txt'))
        val_files, val_labels = read_kv(osp.join(db_dir, 'val.txt'))
        unique_ids = set(map(int, train_labels + val_labels))
        id_mapping = {idx: i + id_offset for i, idx in enumerate(unique_ids)}
        for k, v in zip(train_files, train_labels):
            merged_train_kv[osp.join(dataset_dir, k)] = id_mapping[int(v)]
        for k, v in zip(val_files, val_labels):
            merged_val_kv[osp.join(dataset_dir, k)] = id_mapping[int(v)]
        id_offset += len(id_mapping)
    mkdir_if_missing(osp.join(args.output_dir))
    train_list = [k + ' ' + str(v) for k, v in merged_train_kv.items()]
    np.random.shuffle(train_list)
    write_list(train_list, osp.join(args.output_dir, 'train.txt'))
    write_kv(list(merged_val_kv.keys()), [str(v) for v in merged_val_kv.values()],
             osp.join(args.output_dir, 'val.txt'))
    print("Max ID:", id_offset)
Example #17
    def metrics(self, rankings, train_labels, test_labels, sorted_prods):
        ###    Calculates classification and products set and position mAP   ###
        ###------------------------------------------------------------------###

        print("Starting to calculate metrics ...")
        start = time.time()
        rel_ranks = []
        for i in range(len(rankings)):
            rel_ranks.append(
                utils.relevance_ranking(rankings[i], train_labels,
                                        test_labels[i]))

        # Classification mAP
        #-----------------------------------------------------------------------
        class_ap = [utils.class_ap(rel_rk) for rel_rk in rel_ranks]
        class_ap_filename = "results/class_avg_precs_{0}.txt".format(
            self.n_classes)
        utils.write_list(class_ap, class_ap_filename)
        class_map = np.mean(class_ap)
        self.log += "ranking size = {0}".format(len(rankings[0])) + "\n"
        s = "classification mean average precision = {0}".format(class_map)
        self.log += s + "\n"
        print(s)

        # Dot products average precision
        #-----------------------------------------------------------------------
        # Set
        set_prec = []
        for i in range(len(rankings)):
            indices = [prods[0] for prods in sorted_prods[i]]
            precision = utils.prod_set_prec(indices, rankings[i])
            set_prec.append(precision)
        set_ap_filename = "results/set_avg_precs_{0}.txt".format(
            self.n_classes)
        utils.write_list(set_prec, set_ap_filename)
        set_map = np.mean(set_prec)
        s = "set mean average precision = {0}".format(set_map)
        self.log += s + "\n"
        print(s)

        # Position
        pos_prec = []
        for i in range(len(rankings)):
            indices = [prods[0] for prods in sorted_prods[i]]
            precision = utils.prod_pos_prec(indices, rankings[i])
            pos_prec.append(precision)
        pos_ap_filename = "results/pos_avg_precs_{0}.txt".format(
            self.n_classes)
        utils.write_list(pos_prec, pos_ap_filename)
        pos_map = np.mean(pos_prec)
        s = "position mean average precision = {0}".format(pos_map)
        self.log += s + "\n"
        print(s)

        end = time.time()
        elapsed_time = utils.humanize_time(end - start)
        s = "Elapsed time calculating metrics: {0}".format(elapsed_time)
        self.log += s + "\n"
        print(s)
Example #18
def upload_photos_to_album(album_id, photos):
    print('Upload photos to album %s' % album_id)
    if os.path.isfile(photos):
        files = [os.path.basename(photos)]
        output = os.path.dirname(photos)
    else:
        files = os.listdir(photos)
        output = photos
    done_file = os.path.join(output, '%s_done.txt' % album_id)
    finished = read_list(done_file)
    error_count = 0
    for f in files:
        image = os.path.join(output, f)
        _, ext = os.path.splitext(f)
        if not ext or ext.lower() not in ['.jpg', '.png', '.gif']:
            # print('Invalid %s' % image)
            continue
        try:
            if f not in finished:
                print('Uploading %s' % image)
                api.photo_upload(album_id, image, f)
                finished.append(f)
                write_list(done_file, finished)
                time.sleep(random.randint(1, 3))
            else:
                print('Skip %s' % image)
        except KeyboardInterrupt:
            print("User interrupt, quit.")
            raise
        except Exception as e:
            print("Error:%s On uploading :%s" % (e, image))
            traceback.print_exc()
            error_count += 1
            if error_count > 5:
                break
            time.sleep(error_count * 10)
Example #20
def make_data(files, noise_types, data_root):
    # noise types training and val
    merged = list(zip(files, noise_types))
    np.random.shuffle(merged)
    training = ['{} {}'.format(f, t) for f, t in merged[:8000]]
    test = ['{} {}'.format(f, t) for f, t in merged[8000:]]
    write_list(training, osp.join(data_root, 'ntype_train.txt'))
    write_list(test, osp.join(data_root, 'ntype_test.txt'))
    # noise types of mixed training images
    dic = defaultdict(lambda: -1)
    dic.update(dict(zip(files, noise_types)))
    files = read_list(osp.join(data_root, 'mixed_train_images.txt'))
    files = [f.split()[0] for f in files]
    noise_types = [dic[f] for f in files]
    write_list(noise_types, osp.join(data_root, 'mixed_train_label_ntype.txt'))
def main(args):
    q = generate_matrix_q(args.level)
    write_matrix(q, osp.join(args.data_root, 'matrix_q.txt'))
    pickle(q, osp.join(args.data_root, 'matrix_q.pkl'))
    files, labels = parse(osp.join(args.data_root, 'train.txt'))
    noisy_labels = corrupt(labels, q)
    write_file_label_list(files[:10000], labels[:10000],
        osp.join(args.data_root, 'clean_train.txt'))
    write_file_label_list(files[:10000], noisy_labels[:10000],
        osp.join(args.data_root, 'noisy_train.txt'))

    noisy_as_clean_labels = labels[:10000] + noisy_labels[10000:]
    noisy_as_none_labels = labels[:10000] + [-1] * 40000
    clean_as_none_labels = [-1] * 10000 + noisy_labels[10000:]
    merged = list(zip(files, noisy_as_clean_labels, noisy_as_none_labels, clean_as_none_labels))
    np.random.shuffle(merged)
    files, nacl, nanl, canl = zip(*merged)
    write_file_label_list(files, nacl,
                          osp.join(args.data_root, 'mixed_train.txt'))
    write_list([f + ' -1' for f in files],
               osp.join(args.data_root, 'mixed_train_images.txt'))
    write_list(nanl, osp.join(args.data_root, 'mixed_train_label_clean.txt'))
    write_list(canl, osp.join(args.data_root, 'mixed_train_label_noisy.txt'))
import os
import sys

import utils
from utils import load_list  # assumption: load_list is the counterpart of utils.write_list

# collect bold and header terms
sites = ['gamepedia', 'LeagueFandom', 'mobafire']

for site in sites:
    terms = list()
    folder = os.path.join("data", site + "Data")
    files = os.listdir(folder)
    boldFiles = [b for b in files if "boldTerms" in b]
    headerFiles = [h for h in files if "headerTerms" in h]
    fileLists = [boldFiles, headerFiles]
    termTypeNames = ["bold", "header"]
    # note: terms is not reset between the two file lists, so the header output
    # also accumulates the bold terms collected first
    for j, filelist in enumerate(fileLists):
        for idx, filename in enumerate(filelist):
            #clear_output(wait=True)
            print("processing", site, ":", termTypeNames[j],
                  ": {} of {}".format(idx,
                                      len(filelist) - 1))
            try:
                terms = terms + load_list(os.path.join(folder, filename))
            except Exception:
                e = sys.exc_info()
                print(e)
                print(os.path.join(folder, filename))
        utils.write_list(
            terms,
            os.path.join("data", site + "_" + termTypeNames[j] + "Terms.txt"))
print("done!")
Example #23
                     suffix=file + " " * (80 - len(file)),
                     length=50)

    text = ""
    with open(file, 'r', encoding='utf-8', errors='ignore') as f:
        text = f.read()
        # remove the first line, which is the source url and add it to urls list
        url = re.match(".*\n", text)[0]
        urls.append(url.strip())
        # strip that first line from the text
        text = re.sub(".*\n", "", text, count=1)
        text = text.lower()
        # remove skin titles
        for s in removeText:
            text = text.replace(s.lower(), '')
        # add a space character to front and back so that full word search works for first and last
        text = " " + text + " "
        for jdx, t in enumerate(terms):
            # only count full-word matches (term surrounded by spaces)
            count = text.count(f" {t} ")
            counts[idx][jdx] = count
    # write the counts
    with open(countFilename, 'a+', encoding='utf-8', errors='ignore') as f:
        f.write(urls[idx])
        f.write('\t')
        for c in counts[idx]:
            f.write(str(int(c)) + "\t")
        f.write("\n")

write_list(urls, "data/urls.txt")
print("done!")
Example #24
        for t in termsToReplace:
            w = w.replace(t[0], t[1])
        w = re.sub(r'\(.*\)', '', w)
        terms.add(w.lower().strip())
fandomTermsList = list(terms)
fandomTermsList.sort()

# Gamepedia
file = "data/gamepedia_CompletedPagesList.txt"
pages = utils.load_list(file)
terms = set()
termsToReplace = [('_', ' '), ('%27', '\''), ('(item)', ''), ('%26', '&')]
for p in pages:
    if "Teamfight_Tactics" in p:
        continue
    words = p.split('/')
    for w in words:
        for t in termsToReplace:
            w = w.replace(t[0], t[1])
        w = re.sub(r'\(.*\)', '', w)
        terms.add(w.lower().strip())
gamepediaTermsList = list(terms)
gamepediaTermsList.sort()

terms = set.union(set(mobafireTermsList), set(fandomTermsList),
                  set(gamepediaTermsList))
termsList = list(terms)
termsList.sort()
utils.write_list(termsList, "data/termsList.txt")
Example #25
    np.savetxt(train_file, imgs_train, delimiter=' ', fmt='%s')
    np.savetxt(test_file, np.concatenate(imgs_test_l, axis=0), delimiter=' ', fmt='%s')


if __name__ == '__main__':
    # prob = [1.35, 0.7, 1.35]
    prob = [1.5, 0., 1.5]

    # train_file = HOME + '/prj/few-shot/data/imglst/img10k.train.txt'
    # test_file = HOME + '/prj/few-shot/data/imglst/img10k.test.txt'
    # prefix = HOME + '/prj/few-shot/data/imagenet-raw'
    train_file = HOME + '/prj/few-shot/data/imglst/img10k.train.no1k.txt'
    test_file = HOME + '/prj/few-shot/data/imglst/img10k.test.no1k.txt'
    prefix = HOME + '/prj/few-shot/data/imagenet-raw'

    # train_file = HOME + '/prj/few-shot/data/imglst/img10k.train.disk.txt'
    # test_file = HOME + '/prj/few-shot/data/imglst/img10k.test.disk.txt'
    # prefix = HOME + '/prj/few-shot/data/imagenet-raw'

    # train_file2 = HOME + '/prj/few-shot/data/imglst/img10k.train.redis.txt'
    # test_file2 = HOME + '/prj/few-shot/data/imglst/img10k.test.redis.txt'
    # prefix2 = '/mnt/nfs1703/kchen/imagenet-raw-trans-to-redis'


    names, nimgs = cls_sample(num, prob)
    # utils.write_list('/home/wangxinglu/prj/few-shot/data/imagenet10k.txt.chk', names, delimiter=' ', fmt='%s')
    utils.write_list('/home/wangxinglu/prj/few-shot/data/imagenet10k.no1k.chk', names, delimiter=' ', fmt='%s')

    gen_imglst(names, prefix, train_file, test_file)
    # gen_imglst(names, prefix2, train_file2, test_file2)
Example #27
def _save(file_label_list, file_path):
    content = ['{} {}'.format(x, y) for x, y in file_label_list]
    write_list(content, file_path)
Example #28
def write_file_label_list(files, labels, file_path):
    content = ['{} {}'.format(f, l) for f, l in zip(files, labels)]
    write_list(content, file_path)
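A minimal usage sketch for these one-liners (file names are illustrative):

files = ['img/00000.jpg', 'img/00001.jpg']
labels = [3, 7]
write_file_label_list(files, labels, 'train.txt')
# train.txt now contains:
#   img/00000.jpg 3
#   img/00001.jpg 7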
Example #29
import os
import sys
import time

from config import API_KEY, API_SECRET, USERNAME, PASSWORD

if __name__ == '__main__':
    print(sys.argv)
    if len(sys.argv) < 3:
        print('Usage: python %s album_id dir' % sys.argv[0])
        exit(1)
    api = ApiClient(key=API_KEY, secret=API_SECRET)
    print(api.login(USERNAME, PASSWORD))
    album = sys.argv[1]
    directory = sys.argv[2]
    files = os.listdir(directory)
    finished = read_list('%s.txt' % album)
    error_count = 0
    for f in files:
        image = os.path.join(directory, f)
        try:
            if f not in finished:
                print('Uploading %s' % image)
                api.photo_upload(album, image, f)
                finished.append(image)
                time.sleep(2)
        except Exception as e:
            print("error:%s on uploading :%s" % (e, image))
            error_count += 1
            if error_count > 5:
                break
            time.sleep(error_count * 10)
    write_list('%s.txt' % album, finished)
Example #30
        page = requests.get(site)
        soup = BeautifulSoup(page.content, 'lxml')
        text = re.split(r"\s+", soup.get_text())
        for t in text:
            if "http" in t:
                if t not in visitedPages:
                    if ".xml" in t:
                        sitemap.append(t)
                    else:
                        pageList.append(t)

remove = [
    'build', 'teamfight-tactics', 'stream', 'blog', 'video', 'tier-list',
    '/wiki/mobafire', 'forum', 'news', 'toplist', 'tournaments'
]

refined = [
    p.replace("https://www.mobafire.com/league-of-legends/", "")
    for p in pageList if "league-of-legends" in p
]
for r in remove:
    refined = [item for item in refined if r not in item]

builds = [u + "/stats" for u in refined if "champion/" in u]
for b in builds:
    refined.append(b)

refined.sort()

utils.write_list(refined, "data/mobafire.txt")
Example #31
    resp = requests.get(baseurl + p)
    strainer = SoupStrainer(class_=["tabheader-tab", 'mw-body-content'])
    soup = BeautifulSoup(resp.content, 'lxml', parse_only=strainer)
    l = list()
    soup.find('div', id="top-schedule").decompose()
    # find all "a" tags, then grab only the ones that have a link (i.e. 'href')
    for t in soup.find_all("a"):
        if ('href' in t.attrs and "http" not in t['href']
                and t['href'][0] != "#" and "action=edit" not in t['href']):
            link = t['href']
            if link[0] == '/':
                link = link[1:]
            l.append(link)
    # add to the list of links
    for a in l:
        pageList.append(a)
    time.sleep(1)

pageList = list(set(pageList))
pageList.sort()
remove = [
    "Patch", "Portal:", "2018", "File:", "Bjergsen", "TSM", "Echo_Fox", "#",
    "Template:"
]
for r in remove:
    pageList = [p for p in pageList if r not in p]

utils.write_list(pageList, "data/gamepedia.txt")

print("finished")
Example #32
def predict(input_dir, output_dir, embeddings, vocabulary, scores=False):
    dirname = os.path.dirname(os.path.realpath(__file__))
    print(dirname)

    blocks_path = os.path.join(input_dir, "blocks.pkl")
    records_path = os.path.join(input_dir, "records.tf")
    write_pred_score = (scores is True)

    print("Defining estimator...")
    rconf = tf.estimator.RunConfig(
        save_checkpoints_steps=config.SAVE_CHECKPOINT_STEPS,
        save_checkpoints_secs=None,
        model_dir=os.path.join(dirname, config.MODEL_HOME))

    params = {
        "padding_value": vocabulary["<PAD>"],
        "wembs": embeddings,
        "vocab": vocabulary,
        "coherence_hinge_margin": 1,
        "learning_rate": 0.0001
    }
    estimator = tf.estimator.Estimator(model_fn=model.model_fn,
                                       config=rconf,
                                       params=params)

    print("Loading serialized raw test set texts...")
    test_texts = pickle.load(open(blocks_path, "rb"))
    print("Loaded.")

    res = estimator.predict(input_fn=lambda: get_data.get_data(
        records_path, is_train=False, epochs=1))

    print("Documents to segment: " + str(len(test_texts[0])))
    flat_blocks = []
    for x in test_texts[0]:
        print(len(x[1]))
        flat_blocks.extend(x[1])

    print("Number of prediction blocks: " + str(len(flat_blocks)))

    print(
        "Predicting with the model (this may take a while, depending on the number of documents)..."
    )
    res_list = list(res)
    print("Predictions completed.")

    thold = 0.3 if config.MODEL_TYPE == "cats" else 0.5

    glob_cntr = 0
    docs = test_texts[0]

    agg_docs = []

    for i in range(len(docs)):
        fname = docs[i][0]
        if i % 1000 == 1:
            print(fname)
            print(str(i) + " of " + str(len(docs)) + " documents...")
        blocks = docs[i][1]
        preds_blocks = res_list[glob_cntr:glob_cntr + len(blocks)]
        glob_cntr += len(blocks)

        sent_scores = [(b[0][0], b[0][1], []) for b in blocks]
        for b_ind in range(len(blocks)):
            for relb_ind in range(len(blocks[b_ind])):
                if blocks[b_ind][relb_ind][0] == config.fake_sent:
                    break
                else:
                    sent_ind = b_ind + relb_ind
                    score = preds_blocks[b_ind][relb_ind][1]
                    sent_scores[sent_ind][2].append(score)
        agg_sent_scores = [(x[0], x[1], np.mean(x[2]),
                            (1 if np.mean(x[2]) >= thold else 0))
                           for x in sent_scores]
        agg_docs.append(agg_sent_scores)

    # printing out predictions
    docnames = [x[0] for x in docs]
    print("Storing segmented texts...")
    docscores = zip(docnames, agg_docs)
    for name, sentscores in docscores:
        print("Document: " + name)
        lines = []
        for s in sentscores:
            if s[2] >= thold:
                lines.append(config.seg_start)
            lines.append(s[0] + "\t" + str(s[2]) if write_pred_score else s[0])
        utils.write_list(
            os.path.join(output_dir,
                         name.split("/")[-1] + ".seg"), lines)
    print("Stored.")
Example #33

def fetch_all(total):
    list_file = 'jiusong_all.txt'
    page_file = 'jiusong_p_%s.txt'
    all_urls = []
    for i in range(1, total):
        try:
            urls = fetch_page(i)
            if urls:
                all_urls.extend(urls)
                write_list(page_file % i, urls)
            time.sleep(random.randint(1, 2))
        except KeyboardInterrupt:
            print("User interrupt, quit.")
            raise
        except Exception as e:
            print("Error:%s On fetch page %s" % (e, i))
            traceback.print_exc()
            time.sleep(10)
        finally:
            write_list(list_file, all_urls)


def main():
    fetch_all(405)


if __name__ == '__main__':
    main()
Example #34
		files = os.listdir(folder)
		boldFiles = [b for b in files if "boldTerms" in b]
		headerFiles = [h for h in files if "headerTerms" in h]
		fileLists = [boldFiles, headerFiles]
		# termTypeNames defined above
		for j, filelist in enumerate(fileLists):
			for idx, filename in enumerate(filelist):
				#clear_output(wait=True)
				print("processing", site,":",termTypeNames[j],": {} of {}".format(idx,len(filelist)-1) )
				try:
					terms = terms + load_list(os.path.join(folder,filename))
				except:
					e = sys.exc_info()
					print(e)
					print(os.path.join(folder,filename))                    
			write_list(terms, os.path.join("data",site+"_"+termTypeNames[j]+"Terms.txt"))
	print("done!")


# load all the terms
print("loading terms from preprocessed files...")
sites = ['gamepedia','LeagueFandom','mobafire']
files = ['_CompletedPagesList.txt']
if args.bold:
	files.append('_boldTerms.txt')
if args.header:
	files.append('_headerTerms.txt')
terms = list()

# for each site, for each set of terms
# filter them according to the function above
Example #36
        output = photos
    done_file = os.path.join(output, '%s_done.txt' % album_id)
    finished = read_list(done_file)
    error_count = 0
    for f in files:
        image = os.path.join(output, f)
        _, ext = os.path.splitext(f)
        if not ext or ext.lower() not in ['.jpg', '.png', '.gif']:
            # print('Invalid %s' % image)
            continue
        try:
            if f not in finished:
                print('Uploading %s' % image)
                api.photo_upload(album_id, image, f)
                finished.append(f)
                write_list(done_file, finished)
                time.sleep(random.randint(1, 3))
            else:
                print('Skip %s' % image)
        except KeyboardInterrupt:
            print("User interrupt, quit.")
            raise
        except Exception as e:
            print("Error:%s On uploading :%s" % (e, image))
            traceback.print_exc()
            error_count += 1
            if error_count > 5:
                break
            time.sleep(error_count * 10)
    write_list(done_file, finished)
Example #37
            ]

# Go through remove list
removeList = list()
for r in badList:
    print(r)
    resp = requests.get(r)
    strainer = SoupStrainer(class_=['category-page__member'])
    soup = BeautifulSoup(resp.content, 'lxml', parse_only=strainer)
    l = list()
    #soup.find('div', id="top-schedule").decompose()
    for t in soup.find_all("a"):
        if ('href' in t.attrs and "http" not in t['href']
                and "#" not in t['href'] and "action=edit" not in t['href']):
            link = t['href']
            # remove '/wiki/' from the beginning of the link
            link = link.replace("/wiki/", "")
            l.append(link)

    for a in l:
        removeList.append(a)
    time.sleep(1)

removeList = list(set(removeList))
baseList = utils.load_list("data/LeagueFandom.txt")

# Go through every item in removeList and remove it from baseList
for w in removeList:
    baseList = [b for b in baseList if b != w]

utils.write_list(baseList, 'data/LeagueFandom.txt')