Example #1
def build_word_count():
    if os.path.isfile('pickled/wcount.pickle'):
        return read_pickle('pickled/wcount.pickle')
    wcount = Counter()
    # Count lowercase word tokens across the five NLTK corpora.
    for corpus in (words, gutenberg, brown, reuters, inaugural):
        for fid in corpus.fileids():
            for word in corpus.words(fid):
                word = word.lower()
                if only_words.match(word) is not None:
                    wcount[word] += 1
    dump_pickle(wcount, 'pickled/wcount.pickle')
    return wcount
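
All of these examples call helpers such as dump_pickle, read_pickle, load_pickle, or util.dump_pickle that are defined elsewhere in each project. As a rough orientation only, here is a minimal sketch of what such helpers might look like, assuming they are thin wrappers around the standard pickle module; note that some projects below pass the output path as the first argument (e.g. dump_pickle("words.pkl", pages) in Example #25), so the exact signature and extras (logging, overwrite checks) are project-specific.

import os
import pickle


def dump_pickle(obj, path):
    # Sketch of an assumed helper: pickle obj to path, creating the directory if needed.
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)


def read_pickle(path):
    # Sketch of an assumed helper: load and return the object pickled at path.
    with open(path, 'rb') as f:
        return pickle.load(f)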
Example #2
def get_link_df():
    if not os.path.exists(wsmall_latest_linkdf):
        names = ['wsmall_file', 'href']
        ldf = pd.read_csv(wsmall_latest_links, sep=',', names=names)
        dump_pickle(ldf, wsmall_latest_linkdf)
        return ldf
    else:
        return read_pickle(wsmall_latest_linkdf)
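
Example #2 is the shortest instance of the pattern most examples on this page repeat: if the pickle file is missing, build the object and dump it, otherwise read it back. A hypothetical helper that factors this pattern out (not part of any project shown here) could look like this:

def cached_pickle(path, build):
    # Compute-or-load sketch: call build() only when no pickle exists at path.
    if os.path.exists(path):
        return read_pickle(path)
    obj = build()
    dump_pickle(obj, path)
    return obj


# Hypothetical usage mirroring get_link_df() above:
# ldf = cached_pickle(wsmall_latest_linkdf,
#                     lambda: pd.read_csv(wsmall_latest_links, sep=',',
#                                         names=['wsmall_file', 'href']))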
Example #3
def check_webarchive(ldf):
    if not os.path.exists(wsmall_latest_webarchive):
        archive_sites = []
        for archive_site in archiving_sites:
            archive_sites.append(ldf[ldf.href.str.startswith(archive_site)])
        about_wba = pd.concat(archive_sites)
        dump_pickle(about_wba, wsmall_latest_webarchive)
        return about_wba
    else:
        return read_pickle(wsmall_latest_webarchive)
Example #4
def save_result_file(seq_name, pred_dir, tracks):
    seq_filepaths = glob.glob(f'{paths.DATA_ROOT}/3dpw/sequenceFiles/*/*.pkl')
    seq_path = next(p for p in seq_filepaths
                    if os.path.basename(p) == f'{seq_name}.pkl')
    rel_path = '/'.join(util.split_path(seq_path)[-2:])
    out_path = f'{pred_dir}/{rel_path}'
    n_frames = len(
        glob.glob(f'{paths.DATA_ROOT}/3dpw/imageFiles/{seq_name}/image_*.jpg'))
    coords3d_raw = np.array(
        [complete_track(track, n_frames) for track in tracks]) / 1000
    util.dump_pickle(dict(jointPositions=coords3d_raw), out_path)
Example #5
def get_users2(ureviews, mset):
    if not os.path.exists(usr_pickle2):
        def user_trans(lines):
            return map(lambda line: UserWReviews(split=usr_split.split(line), ureviews=ureviews, mset=mset), lines)

        with SelectFromFile(user_file, transformer=user_trans, selector=lambda x: list(x)) as it:
            usrs = it
        dump_pickle(usrs, usr_pickle2)
    else:
        usrs = read_pickle(usr_pickle2)
    return usrs
Example #6
def get_movies():
    if not os.path.exists(movie_pickle):
        def move_clean_split(line):
            return Movie(msplit.split(msanity.sub('|', line.rstrip())))

        movie_map = {}
        with codecs.open(movie_f, 'r', encoding='utf-8', errors='replace') as movs:
            for mov in map(move_clean_split, movs):
                movie_map[mov.mid] = mov
        dump_pickle(movie_map, movie_pickle)
    else:
        movie_map = read_pickle(movie_pickle)
    return movie_map
Example #7
def domain_info(ldf=None):
    if not os.path.exists(wsmall_latest_no_wlink):
        # Keep external (www/http) links that are not wiki links.
        no_wlinks = ldf[ldf.href.str.contains(r'(?:www)|(?:http)')
                        & ~ldf.href.str.contains('wiki')].copy()
        no_wlinks['href'] = no_wlinks['href'].apply(front_slash_nuke)
        dump_pickle(no_wlinks, wsmall_latest_no_wlink)
        if not os.path.exists(wsmall_latest_no_wlinkd):
            no_wlinks['domain'] = no_wlinks.href.map(domain_getter)
            dump_pickle(no_wlinks, wsmall_latest_no_wlinkd)
    else:
        no_wlinks = read_pickle(wsmall_latest_no_wlinkd)

    return no_wlinks
Example #8
def get_users(reviews=None):
    if not os.path.exists(usr_pickle):
        def user_trans(lines):
            return map(lambda line: User(usr_split.split(line), reviews), lines)

        usrs = {}
        with SelectFromFile(user_file, transformer=user_trans, selector=lambda x: list(x)) as it:
            for u in it:
                usrs[u.id] = u
        dump_pickle(usrs, usr_pickle)
    else:
        usrs = read_pickle(usr_pickle)
    return usrs
Example #9
def check_ar_outlinks(ars):
    if not os.path.exists(wsmall_statues_ar):
        result = {}
        temp = []
        processed = 0
        c = 0
        with FuturesSession(
                session=requests.Session(),
                executor=ProcessPoolExecutor(max_workers=10)) as session:
            for href in ars.href:
                temp.append(href)
                if len(temp) >= 100:
                    pending = []
                    for url in temp:
                        result[url] = -1
                        pending.append(
                            session.head(url,
                                         headers={'User-Agent': useragents[c]},
                                         timeout=5.0))
                        c += 1
                        if c == 3:
                            c = 0
                    for future in pending:
                        try:
                            response = future.result()
                            url = response.url
                            scode = response.status_code
                            result[url] = scode
                        except Exception:
                            # Request failed or timed out; keep the -1 marker for this URL.
                            pass
                        processed += 1
                        if processed % 100 == 0:
                            print(processed)
                    temp.clear()
        print('outside the with')
        had_status = {'archive': [], 'status': []}
        timed_out = {'archive': [], 'status': []}
        for k, v in result.items():
            ar = archives_map(k)
            if v == -1:
                timed_out['archive'].append(ar)
                timed_out['status'].append(v)
                continue
            had_status['archive'].append(ar)
            had_status['status'].append(v)
        hs = pd.DataFrame(had_status)
        to = pd.DataFrame(timed_out)
        dump_pickle((hs, to), wsmall_statues_ar)
        return hs, to
    else:
        return read_pickle(wsmall_statues_ar)
Example #10
def generate_poseviz_gt(i_subject, activity_name, camera_id):
    camera_names = ['54138969', '55011271', '58860488', '60457274']
    camera_name = camera_names[camera_id]
    data, camera = get_examples(i_subject,
                                activity_name,
                                camera_id,
                                frame_step=1,
                                correct_S9=True)

    results = []
    examples = []
    for image_relpath, world_coords, bbox in data:
        results.append({
            'gt_poses': [world_coords.tolist()],
            'camera_intrinsics':
            camera.intrinsic_matrix.tolist(),
            'camera_extrinsics':
            camera.get_extrinsic_matrix().tolist(),
            'image_path':
            image_relpath,
            'bboxes': [bbox.tolist()]
        })
        ex = ps3d.Pose3DExample(image_relpath,
                                world_coords,
                                bbox,
                                camera,
                                activity_name=activity_name)
        examples.append(ex)

    joint_names = ('rhip,rkne,rank,lhip,lkne,lank,tors,neck,head,htop,'
                   'lsho,lelb,lwri,rsho,relb,rwri,pelv'.split(','))
    edges = ('htop-head-neck-lsho-lelb-lwri,neck-rsho-relb-rwri,'
             'neck-tors-pelv-lhip-lkne-lank,pelv-rhip-rkne-rank')
    joint_info = ps3d.JointInfo(joint_names, edges)
    ds = ps3d.Pose3DDataset(joint_info, test_examples=examples)
    util.dump_pickle(
        ds,
        f'{paths.DATA_ROOT}/h36m/poseviz/S{i_subject}_{activity_name}_{camera_name}.pkl'
    )

    output = {}
    output['joint_names'] = joint_info.names
    output['stick_figure_edges'] = joint_info.stick_figure_edges
    output['world_up'] = camera.world_up.tolist()
    output['frame_infos'] = results
    util.dump_json(
        output,
        f'{paths.DATA_ROOT}/h36m/poseviz/S{i_subject}_{activity_name}_{camera_name}.json'
    )
Example #11
def get_movies2():
    if not os.path.exists(movie_pickle2):
        def move_clean_split(line):
            return Movie(msplit.split(msanity.sub('|', line.rstrip())))

        movies = []
        movie_idx = {}
        with codecs.open(movie_f, 'r', encoding='utf-8', errors='replace') as movs:
            for idx, mov in enumerate(map(move_clean_split, movs)):
                movies.append(mov)
                movie_idx[mov.mid] = idx
        dump_pickle((movies, movie_idx), movie_pickle2)
    else:
        movies, movie_idx = read_pickle(movie_pickle2)
    return movies, movie_idx
Example #12
def get_reviews2():
    if not os.path.exists(usr_review_pickle2):
        def review_mapper(line):
            ur = UserReview(split=usr_ratting.split(line.rstrip()))
            return ur

        def trans(rvs):
            return seq(rvs).map(review_mapper).group_by(lambda ur: ur.uid).to_dict()

        with SelectFromFile(usr_review_file, transformer=trans, selector=lambda x: x) as r:
            reviews = r
        dump_pickle(reviews, usr_review_pickle2)
    else:
        reviews = read_pickle(usr_review_pickle2)

    return reviews
Example #13
def check_wiki_live_web_links():
    if not os.path.exists(wsmall_latest_nwl_status):
        link_df = get_link_df()
        di = domain_info(link_df)
        result = {}
        temp = []
        num = re.compile(
            r'^(?:(?:www\.)|(?:http://)|(?:https://))?(?:[0-9]{1,3}\.){3}.+$')
        c = 0
        processed = 0
        with FuturesSession(
                session=requests.Session(),
                executor=ProcessPoolExecutor(max_workers=10)) as session:
            for href in di.href.unique():
                if not href.startswith('http'):
                    href = 'http://' + href
                if num.match(href) is not None:
                    result[href] = -2
                    continue
                temp.append(href)
                if len(temp) >= 1000:
                    pending = []
                    for url in temp:
                        result[url] = -1
                        pending.append(
                            session.head(url,
                                         headers={'User-Agent': useragents[c]},
                                         timeout=5.0))
                        c += 1
                        if c == 3:
                            c = 0
                    for future in pending:
                        try:
                            response = future.result()
                            url = response.url
                            scode = response.status_code
                            result[url] = scode
                        except Exception:
                            # Request failed or timed out; keep the -1 marker for this URL.
                            pass
                        processed += 1
                        if processed % 1000 == 0:
                            print(processed)
                    temp.clear()

        print('outside the with')
        dump_pickle(result, wsmall_latest_nwl_status)
Example #14
def get_reviews(movie_map):
    if not os.path.exists(usr_review_pickle):
        def review_mapper(line):
            ur = URating(usr_ratting.split(line.rstrip()))
            ur.mname = movie_map.get(ur.itemid, None)
            return ur

        def trans(rvs):
            return seq(rvs).map(review_mapper).group_by(lambda ur: ur.uid).to_dict()

        with SelectFromFile(usr_review_file, transformer=trans, selector=lambda x: x) as r:
            reviews = r
        dump_pickle(reviews, usr_review_pickle)
    else:
        reviews = read_pickle(usr_review_pickle)

    return reviews
Example #15
def tidy_uris(s_list):
    if not os.path.exists(tidied_uirs):
        t = []
        for w in s_list:
            original = os.path.basename(w)
            m = match_end.findall(original)
            if len(m) > 0:
                p_name = original.replace(m[0], '')
            else:
                p_name = original.replace('.html', '')
            t.append({
                'quoted': quote(p_name),
                'unquoted': p_name,
                'original': original
            })
        dump_pickle(t, tidied_uirs)
        return t
    else:
        return read_pickle(tidied_uirs)
Example #16
def compute_old_wsmall_links():
    if not os.path.exists('pickled/wsmallo_outl_temp.pickle'):
        old_link_df = get_oldl_df()
        out = defaultdict(LinkDict)
        for owfile in old_link_df.wsmall_file.unique():
            owdf = old_link_df[old_link_df.wsmall_file == owfile]
            links_to = old_link_df[old_link_df.wsmall_file.isin(owdf.href)]
            links_to_other = owdf[~owdf.href.isin(links_to.href)]
            out[owfile]['outlink_wsmall'] += links_to.wsmall_file.unique().size
            out[owfile]['outlink_other'] += links_to_other.wsmall_file.unique().size
            out[owfile]['total_outlinks'] += (
                out[owfile]['outlink_wsmall'] + out[owfile]['outlink_other'])
            for other in links_to.wsmall_file.unique():
                out[other]['inlink'] += 1
        dump_pickle(out, 'pickled/wsmallo_outl_temp.pickle')
        return out
    else:
        return read_pickle('pickled/wsmallo_outl_temp.pickle')
Example #17
def galago_postingd_csv():
    cline = './rungalago.sh dump-index index'
    with open('output_files/idx3.csv', 'w') as retOut:
        runner = Popen(shlex.split(cline), stdout=retOut, stderr=PIPE)
        print(runner.stderr.read())
    idx = Idx()
    with open('output_files/idx3.csv', 'r') as gal:  # read the index dumped above
        for line in gal:
            lsplit = line.rstrip().lstrip().split(',')
            word = lsplit[0]
            doc = lsplit[1]
            at = lsplit[2:]
            idx[word].add_doc_where(doc, at)
    dump_pickle(idx, 'pickled/idx3.pickle')
    with open('output_files/idx3_terms.txt', 'w') as termOut:
        termOut.write(' '.join(
            sorted(
                filter(lambda x: isWord.match(x) is not None,
                       list(idx.keys())))))
Example #18
def dump_window_idx():
    cline = './rungalago.sh dump-index windowIdx/od.n2.w5.h2'
    with open('output_files/window5/ordered5idx.txt', 'w') as retOut:
        runner = Popen(shlex.split(cline), stdout=retOut, stderr=PIPE)
        print(runner.stderr.read())
    ab_count = Counter()
    ab_count_wins = Counter()
    wins = 0
    with open('output_files/window5/ordered5idx.txt', 'r') as oin:
        for line in oin:
            splitted = line.rstrip().split(',')
            if only_words.match(splitted[0]) is not None:
                a, b = splitted[0].split('~')
                ab_count[a] += 1
                ab_count[b] += 1
                wins += 1
                ab_count_wins[a, b] += 1

    dump_pickle((ab_count, ab_count_wins, wins),
                'pickled/window5Counts.pickle')
Example #19
def dl_pages(tidied_uris):
    if not os.path.exists(wsmall_statues):
        useragent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:44.0) Gecko/20100101 Firefox/44.01'
        statuses = {}
        with requests.session() as session:
            session.headers.update({'User-Agent': useragent})
            for tidy in tidied_uris:
                uri = base % tidy['unquoted']
                r = session.get(uri)
                scode = r.status_code
                print(uri, scode)
                statuses[tidy['original']] = scode
                if scode == 200:
                    with open('wiki_small_latest/%s' % tidy['original'],
                              'w') as out:
                        out.write(r.text)
        dump_pickle(statuses, wsmall_statues)
        return statuses
    else:
        return read_pickle(wsmall_statues)
Example #20
def compute_wnew_link_stats(ldf):
    if not os.path.exists('pickled/wsmall_nlinkstats.pickle'):
        only_wiki = pd.concat([
            ldf[ldf.href.str.contains('/wiki')],
            ldf[ldf.href.str.contains('en.wikipedia.org/wiki')]
        ])
        only_wiki.href = only_wiki.href.apply(lambda h: os.path.basename(h))
        only_wiki.wsmall_file = only_wiki.wsmall_file.apply(
            clean_new_wiki_name)
        wn_lstats = defaultdict(LinkDict)
        for owfile in only_wiki.wsmall_file.unique():
            owdf = only_wiki[only_wiki.wsmall_file == owfile]
            links_to = only_wiki[only_wiki.wsmall_file.isin(owdf.href)]
            links_to_other = owdf[~owdf.href.isin(links_to.href)]
            wn_lstats[owfile]['outlink_wsmall'] += links_to.wsmall_file.unique().size
            wn_lstats[owfile]['outlink_other'] += links_to_other.wsmall_file.unique().size
            wn_lstats[owfile]['total_outlinks'] += (
                wn_lstats[owfile]['outlink_wsmall']
                + wn_lstats[owfile]['outlink_other'])
            for other in links_to.wsmall_file.unique():
                wn_lstats[other]['inlink'] += 1
        it = {
            'wsmall': [],
            'outlink_wsmall': [],
            'outlink_other': [],
            'total_outlinks': [],
            'inlinks': []
        }
        for name in wn_lstats.keys():
            it['wsmall'].append(name)
            it['outlink_wsmall'].append(wn_lstats[name]['outlink_wsmall'])
            it['outlink_other'].append(wn_lstats[name]['outlink_other'])
            it['total_outlinks'].append(wn_lstats[name]['total_outlinks'])
            it['inlinks'].append(wn_lstats[name]['inlink'])
        lstats_df = pd.DataFrame(it)
        dump_pickle(lstats_df, 'pickled/wsmall_nlinkstats.pickle')
        return lstats_df
    else:
        return read_pickle('pickled/wsmall_nlinkstats.pickle')
Example #21
def predict_rated(method='pearson_seq', k=10):
    want = review_file_rated_pickle % (method, k)
    if not os.path.exists(want):
        user_predictions = defaultdict(UserReviewedPred)
        review_df = get_review_df()
        reviewer_sims = get_reviewer_sim(review_df)
        movies = get_movies_df()
        users = review_df.user_id.sort_values().unique()
        for user in users:
            user_reviews = review_df[review_df.user_id == user]
            similar_to_user = reviewer_sims[reviewer_sims.user == user]
            knn = pick_knn(similar_to_user, method=method, k=k)
            user_mean = user_reviews.rating.mean()
            sum_user = 1 / knn.sim.sum()
            neighbors = review_df[review_df.user_id.isin(knn.other_user.unique())]
            reviewed_same = neighbors[neighbors.item_id.isin(user_reviews.item_id)]
            neighbor_reviewed_ids = reviewed_same.item_id.unique()
            for neighbor_review_id in neighbor_reviewed_ids:
                movie_reviewed = reviewed_same[reviewed_same.item_id == neighbor_review_id]
                the_movie = movies[movies.movie_id.isin(movie_reviewed.item_id)]
                movie_reviewed_title = the_movie.movie_title.iat[0]
                neighbor_reviews = neighbors[neighbors.item_id == neighbor_review_id]
                acum = []
                for _, row in neighbor_reviews.iterrows():
                    neighbor_who_did_id = row['user_id']
                    neighbor_who_did_rating = row['rating']
                    neighbor_who_did_mean_rating = review_df[review_df.user_id == neighbor_who_did_id].rating.mean()
                    neighbor_who_did_sim_to_cur_u = knn[knn.other_user == neighbor_who_did_id].sim.iat[0]
                    acum.append(
                        neighbor_who_did_sim_to_cur_u * (neighbor_who_did_rating - neighbor_who_did_mean_rating))
                sum_neighbors = sum(acum)
                predicted_score = user_mean + (sum_user * sum_neighbors)
                the_movie_id = the_movie.movie_id.iat[0]
                actual_r = user_reviews[user_reviews.item_id == the_movie_id].rating.iat[0]
                user_predictions[user].add_movie(the_movie_id, movie_reviewed_title, predicted_score, actual_r)
        dump_pickle(user_predictions, want)
        return user_predictions
    else:
        return read_pickle(want)
Example #22
def save_overall_mask(people, i_out):
    def _get_mask(i):
        i_subject, i_seq, i_cam, i_frame = people[i]
        encoded_mask = get_mask(int(i_subject), int(i_seq), int(i_cam),
                                int(i_frame))
        return improc.decode_mask(encoded_mask)

    def _get_chair_mask(i):
        i_subject, i_seq, i_cam, i_frame = people[i]
        encoded_mask = get_chair_mask(int(i_subject), int(i_seq), int(i_cam),
                                      int(i_frame))
        return improc.decode_mask(encoded_mask)

    overall_mask = _get_mask(0)
    np.maximum(overall_mask, _get_chair_mask(0), out=overall_mask)

    for i in range(1, 4):
        np.maximum(overall_mask, _get_mask(i), out=overall_mask)
        np.maximum(overall_mask, _get_chair_mask(i), out=overall_mask)

    s = f'{i_out + 1:06d}'
    out_path = f'{paths.DATA_ROOT}/muco/masks/{s[:2]}/{s[:4]}/{s}.pkl'
    util.dump_pickle(improc.encode_mask(overall_mask), out_path)
Example #23
def old_wsmall_lstats_df():
    if not os.path.exists('pickled/wsmallo_outl_stats.pickle'):
        old_wsmall_links = compute_old_wsmall_links()
        it = {
            'wsmall': [],
            'outlink_wsmall': [],
            'outlink_other': [],
            'total_outlinks': [],
            'inlinks': []
        }
        for name in old_wsmall_links.keys():
            it['wsmall'].append(name)
            it['outlink_wsmall'].append(
                old_wsmall_links[name]['outlink_wsmall'])
            it['outlink_other'].append(old_wsmall_links[name]['outlink_other'])
            it['total_outlinks'].append(
                old_wsmall_links[name]['total_outlinks'])
            it['inlinks'].append(old_wsmall_links[name]['inlink'])
        lstats_df = pd.DataFrame(it)
        dump_pickle(lstats_df, 'pickled/wsmallo_outl_stats.pickle')
    else:
        lstats_df = read_pickle('pickled/wsmallo_outl_stats.pickle')

    return lstats_df
Example #24
                except:
                    pass
    return changed


max_iter = 5
for i in range(max_iter):
    changed = replace_roots_recursive()
    if not changed:
        print("Nothing changed after %d iterations." % i)
        break

# In[ ]:

# word_roots_dict = load_pickle("word_roots_dict.pkl")
str_lines = []
for line in list(word_roots_dict.items()):
    str_lines.append(line[0] + "\t" + ", ".join(line[1]))
write_lines("word_roots_dict.txt", str_lines)
dump_pickle("word_roots_dict.pkl", word_roots_dict)

# In[ ]:

all_roots = list(itertools.chain(*list(word_roots_dict.values())))
freq_dist = nltk.FreqDist(all_roots).most_common()
write_lines("root_freq_dist.txt", freq_dist)
dump_pickle("root_freq_dist.pkl", freq_dist)
# print(freq_dist)

# In[ ]:
Example #25
file = "words.pkl"

fp = Path(file)
if fp.is_file():
    pages = load_pickle(file)
else:
    pdf_file = open('/playpen/home/tongn/Word Frequency List of American English.pdf', 'rb')
    read_pdf = PyPDF2.PdfFileReader(pdf_file)
    start = 1906
    end = 2053
    pages = []
    for i in range(start, end):
        page = read_pdf.getPage(i)
        page_content = page.extractText()
        pages.append(page_content)
    dump_pickle(file, pages)


# In[ ]:


entries = []
bad_pages = []
for (i, page) in enumerate(pages):
    if test_page >= 0:
        i = test_page
        page = pages[test_page]
    
    if i == 0:
        header_len = 50
    else:
Example #26
def main():
    print('main function called')
    print(ARGS.output)
    if not os.path.exists(ARGS.output):
        print(ARGS.output + ' does not exist, create the folder...')
        os.makedirs(ARGS.output)

    if ARGS.weighted_training:
        svm_classifier = SVMBuilder.get_svm_pipeline(C=ARGS.C,
                                                     class_weight='balanced')
    else:
        svm_classifier = SVMBuilder.get_svm_pipeline(C=ARGS.C)
    print('SVM classifier pipeline created:')
    # print(svm_classifier.get_params())

    if ARGS.cross_validate:
        for cv_set in glob.glob(ARGS.data_dir + '/set*'):
            ix = int(cv_set[-1:])
            print(
                "--------------------------set {}----------------------------".
                format(ix))
            print("Read data from {}".format(cv_set))
            train = TsvReader.from_tsv(tsv_dir=cv_set + '/train.tsv')
            print('Model training...')
            svm_classifier.fit(train.features, train.labels)
            util.dump_pickle(
                svm_classifier,
                ARGS.output + '/cv_model_set_{}'.format(ix),
                log='SVM model classifier from cross validation set {}'.format(
                    ix),
                overwrite=True)
            print('\tTraining finished. Model saved to {}'.format(
                ARGS.output + '/cv_model_set_{}'.format(ix)))
            if ARGS.prediction:
                print('Predict on test set...')
                test = TsvReader.from_tsv(tsv_dir=cv_set + '/dev.tsv')
                if ARGS.predict_prob:
                    prediction = svm_classifier.predict_proba(test.features)
                    # pdb.set_trace()
                    np.savetxt(ARGS.output +
                               '/cv_test_pred_prob_set_{}.txt'.format(ix),
                               prediction,
                               delimiter='\t')
                    print('\tPrediction finished. Result saved to {}'.format(
                        ARGS.output +
                        '/cv_test_pred_prob_set_{}.txt'.format(ix)))
                else:
                    prediction = svm_classifier.predict(test.features)
                    with open(
                            ARGS.output +
                            '/cv_test_pred_set_{}.txt'.format(ix),
                            'w') as handle:
                        for pred in prediction:
                            handle.write('{}\n'.format(int(pred)))
                    print('\tPrediction finished. Result saved to {}'.format(
                        ARGS.output + '/cv_test_pred_set_{}.txt'.format(ix)))
    else:
        print("Import training data...")
        if ARGS.training_data is not None:
            train = TsvReader.from_tsv(tsv_dir=ARGS.training_data)
        else:
            train = TsvReader.from_tsv(tsv_dir=ARGS.data_dir + '/train.tsv')
        # pdb.set_trace()
        print('Model training...')
        svm_classifier.fit(train.features, train.labels)
        util.dump_pickle(svm_classifier,
                         ARGS.output + '/model_trained.pkl',
                         log='SVM model classifier from training set.',
                         overwrite=True)
        print('\tTraining finished. Model saved to {}'.format(
            ARGS.output + '/model_trained.pkl'))
        if ARGS.prediction:
            print('Predict on test set...')
            test = TsvReader.from_tsv(tsv_dir=ARGS.data_dir + '/test.tsv')
            # pdb.set_trace()
            if ARGS.predict_prob:
                prediction = svm_classifier.predict_proba(test.features)
                np.savetxt(ARGS.output + '/test_pred_prob.txt',
                           prediction,
                           delimiter='\t')
                print('\tPrediction finished. Result saved to {}'.format(
                    ARGS.output + '/test_pred_prob.txt'))
            else:
                prediction = svm_classifier.predict(test.features)
                with open(ARGS.output + '/test_pred.txt', 'w') as handle:
                    for pred in prediction:
                        handle.write('{}\n'.format(int(pred)))
                print('\tPrediction finished. Result saved to {}'.format(
                    ARGS.output + '/test_pred.txt'))
Example #27
def make_efficient_example(ex):
    image_relpath = ex.image_path
    max_rotate = np.pi / 6
    padding_factor = 1 / 0.85
    scale_up_factor = 1 / 0.85
    scale_down_factor = 1 / 0.85
    shift_factor = 1.2
    base_dst_side = 256

    box_center = boxlib.center(ex.bbox)
    s, c = np.sin(max_rotate), np.cos(max_rotate)
    w, h = ex.bbox[2:]
    rot_bbox_side = max(c * w + s * h, c * h + s * w)
    rot_bbox = boxlib.box_around(box_center, rot_bbox_side)

    scale_factor = min(base_dst_side / np.max(ex.bbox[2:]) * scale_up_factor,
                       1)
    expansion_factor = padding_factor * shift_factor * scale_down_factor
    expanded_bbox = boxlib.expand(rot_bbox, expansion_factor)
    expanded_bbox = boxlib.intersect(expanded_bbox,
                                     np.array([0, 0, 2048, 2048]))

    new_camera = ex.camera.copy()
    new_camera.intrinsic_matrix[:2, 2] -= expanded_bbox[:2]
    new_camera.scale_output(scale_factor)
    new_camera.undistort()
    dst_shape = improc.rounded_int_tuple(scale_factor * expanded_bbox[[3, 2]])

    new_im_relpath = ex.image_path.replace('3dhp', '3dhp_downscaled')
    new_im_path = os.path.join(paths.DATA_ROOT, new_im_relpath)
    if not (util.is_file_newer(new_im_path, "2019-11-14T23:32:07")
            and improc.is_image_readable(new_im_path)):
        im = improc.imread_jpeg(f'{paths.DATA_ROOT}/{image_relpath}')
        new_im = cameralib.reproject_image(im, ex.camera, new_camera,
                                           dst_shape)
        util.ensure_path_exists(new_im_path)
        imageio.imwrite(new_im_path, new_im)

    new_bbox_topleft = cameralib.reproject_image_points(
        ex.bbox[:2], ex.camera, new_camera)
    new_bbox = np.concatenate([new_bbox_topleft, ex.bbox[2:] * scale_factor])

    mask_rle_relpath = new_im_path.replace('Images', 'FGmaskImages').replace(
        '.jpg', '.pkl')
    mask_rle_path = os.path.join(paths.DATA_ROOT, mask_rle_relpath)
    if util.is_file_newer(mask_rle_path, "2020-03-11T20:46:46"):
        mask_runlength = util.load_pickle(mask_rle_path)
    else:
        mask_relpath = ex.image_path.replace('Images', 'FGmaskImages').replace(
            '.jpg', '.png')
        mask = imageio.imread(os.path.join(paths.DATA_ROOT, mask_relpath))
        mask_reproj = cameralib.reproject_image(mask, ex.camera, new_camera,
                                                dst_shape)
        mask_runlength = get_mask_with_highest_iou(mask_reproj, new_bbox)
        util.dump_pickle(mask_runlength, mask_rle_path)

    return p3ds.Pose3DExample(new_im_relpath,
                              ex.world_coords,
                              new_bbox,
                              new_camera,
                              mask=mask_runlength,
                              univ_coords=ex.univ_coords)
Example #28
if os.path.exists(intermediate_file):
    new_vocab = load_pickle(intermediate_file)
    print(f"Loaded intermediate .pkl file with {len(new_vocab)} entries")
else:
    new_vocab = OrderedDict()

# In[6]:

for i, (word, lst) in tqdm(enumerate(list(vocab.items()))):
    if word in new_vocab:
        continue
    try:
        num_related = get_num_related(word)
    except Exception:
        num_related = -1
    new_vocab[word] = [lst[0], str(num_related)] + lst[2:]

    if i % 25 == 0 and i > 0:  # store intermediate results every 25 iterations
        dump_pickle("intermediate_importance.pkl", new_vocab)
        updated_entries = ['\t'.join([k] + v) for k, v in new_vocab.items()]
        write_lines('importance.txt', updated_entries)

updated_entries = ['\t'.join([k] + v) for k, v in new_vocab.items()]
write_lines('importance.txt', updated_entries)

driver.close()

# In[ ]:

# In[ ]:
Example #29

# In[18]:

if __name__ == "__main__":
    # Parse arguments
    args = parse_args()
    strategy = args.strategy
    path = args.path
    bpe = args.bpe

    # Get vocab and text
    source_path = join(path, "raw")
    target_path = join(path, "indexed")

    prefix = "bpe_" if bpe else ""
    vocab_file = join(path, f"{prefix}vocab.txt")
    vocab_lines = read_lines(vocab_file)
    vocab = [line.split()[0] for line in vocab_lines]
    indexer = Indexer(vocab)

    splits = ["training", "valid", "test"]
    for split in splits:
        source_filename = f"{prefix}raw_{split}_text.{strategy}.txt"
        dialogues = read_lines(join(source_path, source_filename))
        indexed_dialogues = [
            indexer.index(dialogue) for dialogue in tqdm(dialogues)
        ]
        target_filename = f"{prefix}indexed_{split}_text.{strategy}.pkl"
        dump_pickle(join(target_path, target_filename), indexed_dialogues)