def perform_training_and_testing(training_stage, args, data):
    '''
    Returns
    -------
    The validation error. A quantity that we want to minimize.
    '''
    stats = None
    with rasengan.tictoc(training_stage):
        with rasengan.debug_support():
            if args.perform_training or args.perform_testing:
                with rasengan.tictoc("Circuit Compilation"):
                    ttns = get_train_test_namespace(args)
                with rasengan.tictoc("Loading Parameters"):
                    load_params_from_pklfile(ttns, args)
                pass
            rasengan.decrease_print_indent()
            print_pklfn_performance(args)
            rasengan.increase_print_indent()
            # Train
            if args.perform_training:
                with rasengan.tictoc("Training"):
                    stats = lstm_seqlabel_training.training(args, data, ttns)
            # Test (if asked)
            if args.perform_testing:
                with rasengan.tictoc("Testing"):
                    stats = lstm_seqlabel_validation.testing(args, data, ttns)
                    return (100 - stats)
    if stats is None:
        return 100
    else:
        best_epoch_id = stats['best_epoch_id']
        return (100 - stats['validation_result'][best_epoch_id]['f1'])
Example n. 2
def main():
    with rasengan.tictoc("Loading Graph"):
        graph = igraph.read(args.graph_fn)
        graph.to_undirected(mode="collapse",
                            combine_edges=dict(weights="first"))

    with rasengan.tictoc("Creating Adjacent Node List"):
        adjacent_edge_list = graph.get_inclist()
        adjacent_node_list = graph.get_adjlist()

    # 1234 is just a random number.
    assert (adjacent_node_list[1234][0]
            in graph.es[adjacent_edge_list[1234][0]].tuple)

    total_vertices = float(len(adjacent_edge_list))
    with rasengan.tictoc("Creating Local Node Prob List"):
        edge_prob_list = []
        for idx, edges in enumerate(adjacent_edge_list):
            if idx % 1000 == 0:
                print idx / total_vertices * 100
            weights = np.array(graph.es[edges]["weight"])
            edge_prob_list.append(weights / weights.sum())

    queries = read_queries(args.query_fn)

    for qid, query in queries.iteritems():
        for start in query:
            weighted_random_walk(graph,
                                 start,
                                 adjacent_node_list,
                                 edge_prob_list,
                                 path_maxlength=args.path_maxlength,
                                 n_runs=args.n_runs)
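
# A hedged sketch (added, not from the source): weighted_random_walk itself is
# not shown in this snippet, but one step of such a walk can be sampled with
# numpy.random.choice over the per-vertex edge probabilities built above. It
# assumes adjacent_node_list[v] and edge_prob_list[v] are aligned, which the
# assert on vertex 1234 above checks for one vertex.
import numpy as np

def sample_next_vertex(vertex, adjacent_node_list, edge_prob_list):
    neighbours = adjacent_node_list[vertex]
    # Pick the index of an outgoing edge according to its normalized weight.
    idx = np.random.choice(len(neighbours), p=edge_prob_list[vertex])
    return neighbours[idx]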
Example n. 4
def main():
    import argparse
    arg_parser = argparse.ArgumentParser(description='')
    arg_parser.add_argument('--seed', default=0, type=int, help='Default={0}')
    arg_parser.add_argument(
        '--ci_pkl_fn', default='data/dbpedia_cat_index.pkl', type=str,
        help='A map from categories to integer')
    arg_parser.add_argument(
        '--wdc_pkl_fn', default='data/wikilink_dbpedia_categories.pkl', type=str,
        help='A list from url to their categories')
    arg_parser.add_argument(
        '--out_pkl_fn', default='data/wikilink_category_to_url_and_count_reverse_index.pkl', type=str)
    arg_parser.add_argument(
        '--out_tsv_fn', default='data/wikilink_category_to_count.tsv', type=str)
    arg_parser.add_argument(
        '--admissible_url_fn', default='data/dbpedia_people.list', type=str)
    args = arg_parser.parse_args()
    with rasengan.tictoc('Loading pkl'):
        ci = pkl.load(open(args.ci_pkl_fn))
        url_to_cat_cnt = pkl.load(open(args.wdc_pkl_fn))
        admissible_url = set([e.strip() for e in open(args.admissible_url_fn)])
    with rasengan.tictoc('Creating WRI'):
        wri = WikilinkReverseIndex(ci, url_to_cat_cnt, admissible_url)
    with open(args.out_pkl_fn, 'wb') as f:
        pkl.dump(wri, f)
Example n. 5
def svd_1(a, debug=True, inplace=True):
    assert a.flags.c_contiguous
    if debug:
        print_config()
    # NOTE: scipy.linalg.blas.ssyrk(1, a, trans=1, lower=1)
    # causes an unnecessary copy, because a is c_contiguous.
    with tictoc('Computing b'):
        b = scipy.linalg.blas.ssyrk(1, a.T, trans=0, lower=1)
    if debug:
        print_config()
    with tictoc('Computing eigh'):
        [bs, bu] = scipy.linalg.eigh(b, turbo=True, overwrite_a=True,
                                     check_finite=True)
    with tictoc('Scale bu inplace'):
        for i in xrange(bu.shape[1]-1, -1, -1):
            scalar = (1/numpy.sqrt(bs[i])
                      if bs[i] > 1e-6
                      else 0)
            scipy.linalg.blas.sscal(scalar, bu, n=bu.shape[0], offx=i*bu.shape[0])
    if debug:
        print_config(msg='i=%d'%i)
    with tictoc('Inplace Matmul'):
        c = matrix_multiply_inplace.matmul(a, bu)
    del bu
    del bs
    if debug:
        print_config()
    return [c, i]
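
# Added illustration (not part of the original): svd_1 exploits the identity
# that if A^T A = V diag(s) V^T, then A V diag(s)^{-1/2} gives the left
# singular vectors of A. A plain-numpy sketch of the same computation:
import numpy

A = numpy.random.rand(50, 8)
s, V = numpy.linalg.eigh(A.T.dot(A))   # eigenvalues ascending, all positive here
U = A.dot(V / numpy.sqrt(s))           # mirrors the 'Scale bu inplace' step above
# The recovered columns are orthonormal, as left singular vectors should be.
numpy.testing.assert_allclose(U.T.dot(U), numpy.eye(8), atol=1e-6)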
Example n. 6
 def call_impl(self, cat, train_idx, test_idx):
     with rasengan.tictoc('Fitting'):            # 2.1s
         self.fit(self.smat[train_idx], train_idx=train_idx)
     self.smat = self.smat.tocsr()
     with rasengan.tictoc('Prediction'):         # 20s
         scores = self.score(self.smat)
     self.pa(cat, scores, train_idx, test_idx, scratch=self.scratch)
     self.scratch = {}
Example n. 7
def main():
    global args
    args = populate_args()
    rnr = ExperimentRunner(
        datacfg=DATACONFIG,
        ppcfg=CONFIG[args.ppcfg],
        expcfg=EXPCONFIG[args.expcfg],)
    rnr()
    with rasengan.tictoc('Saving Results'):
        rnr.save_results(fn=args.out_pkl_fn)
    with rasengan.tictoc('Reporting'):
        rnr.report()
Example n. 8
    def __init__(self, datacfg, ppcfg, expcfg):
        # Init Part 0
        self.datacfg = datacfg
        self.ppcfg = ppcfg
        self.expcfg = expcfg

        with rasengan.tictoc('Init Part 1 : The Datacfg'):
            self.cp = DbfilenameShelf(
                r'%s/%s'%(uc.get_pfx(),self.datacfg.cp_fn),
                protocol=-1,
                flag='r')
            self.url_list = self.cp['__URL_LIST__']
            self.TM = self.cp['__TOKEN_MAPPER__']
            # self.TM.final must be patched to work with older
            # versions of TokenMapper that are in the pickle.
            if not hasattr(self.TM, 'final'):
                self.TM.final = False
            if self.is_malignull():
                self.TM([self.expcfg.NULL_KEY])
            self.bos_idx = self.TM.finalize()
            self.pa = Aggregator(
                datacfg=datacfg,
                ppcfg=ppcfg,
                expcfg=expcfg,
                url_list=self.url_list,
                TM=self.TM)
            self.cat_folds = pkl.load(uc.proj_open(self.datacfg.fold_fn))
            self.cat2url = uc.load_cat2url(uc.proj_open(self.datacfg.cat2url_fn))
            self.url_to_idx = dict((b,a) for a,b in enumerate(self.url_list))
            self.scratch = {}
            pass

        with rasengan.tictoc('Init Part 2 : The PP CFG'):
            print 'Reading', 'catpeople_pp_%d'%args.ppcfg
            self.smat = io.mmread(uc.proj_open('catpeople_pp_%d'%args.ppcfg))
            assert scipy.sparse.isspmatrix_coo(self.smat)
            if self.pp_prefix_is([UNIVEC, BIVEC, MALIGNER, DSCTOKVEC]):
                self.vectors = np.load(uc.proj_open('catpeople_pp_%d.vec'%args.ppcfg))
            pass

        if self.is_malignull():
            self.NULL_VEC = np.zeros((1,self.vectors.shape[1]))
        if self.exp_prefix_is([NBKERNEL, KERMACH, MALIGNER]):
            assert self.pp_prefix_is([UNIVEC, BIVEC, DSCTOKVEC])
        if self.expcfg.rm_fn_word:
            # Internally Manipulates smat
            self.remove_fn_word()
        if self.expcfg.weight_method.endswith('/df'):
            self.populate_idf()
        return
Example n. 9
def main():
    new_dict = {}
    with rasengan.tictoc('Extracting Features'):
        for k in vertex_dict.keys():
            v = vertex_dict[k]
            new_dict[k] = Entity(v.guid, v.name, v.confidence, v.featsets,
                                 extract_feature_from_entity(v))

    with rasengan.tictoc('Pickling'):
        with open(os.path.expanduser(args.out_fn), 'wb') as f:
            pkl.dump(dict(vertex_dict=new_dict,
                          edgelist=edgelist,
                          TOTAL_FEATURES=TOTAL_FEATURES,
                          PERFECT_HASH=PERFECT_HASH),
                     f,
                     protocol=-1)
def main():
    with open(args.graph_fn) as f:
        data = pickle.load(f)
        pass
    adjacent_node_dict = data['graph']
    edge_prob_dict = data['graph_weights']
    for vertex in edge_prob_dict:
        s = edge_prob_dict[vertex].sum()
        edge_prob_dict[vertex] = edge_prob_dict[vertex] / s
    queries = read_flatfile_query_fn(args.query_fn)
    vertex_count = len(adjacent_node_dict)
    query_visit = {}
    with rasengan.tictoc("Random Walks"):
        for qid, query in queries.iteritems():
            for start in query:
                start = int(start)
                query_visit[qid, start] = weighted_random_walk(
                    vertex_count,
                    start,
                    adjacent_node_dict,
                    edge_prob_dict,
                    path_maxlength=args.path_maxlength,
                    n_runs=args.n_runs)
    with open(args.out_fn, 'wb') as ofh:
        pickle.dump(query_visit, ofh)
    return
Example n. 11
def read_queries(query_fn):
    queries = {}
    with rasengan.tictoc("Reading Queries"):
        with open(query_fn) as f:
            for row in f:
                row = row.strip().split()
                qid = row[0]
                queries[qid] = [int(e) for e in row[1:]]
    return queries
Example n. 12
def entity_list_to_dscfeat_csr_mat(cfg, catpeople):
    url_list = catpeople['__URL_LIST__']
    yield_suf = cfg._name.startswith(DSCSUF)
    shape = (len(url_list), get_width_for_unisuf() if yield_suf else len(TM))
    with rasengan.tictoc('Loading Parses'):  # 1 min
        PARSES = pkl.load(util_catpeople.proj_open(cfg.parsefn))
    print 'Total Rows:', len(url_list)
    iterator = (get_dscfeat_from_catpeople_entity(catpeople[url], cfg, PARSES,
                                                  yield_suf)
                for url in url_list)
    return csr_mat_builder(iterator, shape=shape, verbose=0)
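
# Hedged sketch: csr_mat_builder is not shown in these snippets. A builder of
# this kind can be written by accumulating (row, col, value) triples from an
# iterator of per-row feature-index lists and handing them to scipy.sparse.
import scipy.sparse

def build_csr_from_iterator(iterator, shape):
    data, rows, cols = [], [], []
    for row_idx, feature_indices in enumerate(iterator):
        for col_idx in feature_indices:
            rows.append(row_idx)
            cols.append(col_idx)
            data.append(1.0)
    return scipy.sparse.csr_matrix((data, (rows, cols)), shape=shape)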
 def ls_exp_call_impl(self, fold_idx, cat, train_idx, test_idx):
     mat, features = self.create_mat_features(cat, train_idx)
     print 'Using %d Features For Category %s' % (len(features), cat)
     if len(features) == 0:
         self.pa(cat, [], [], [])
         return
     # ------------------------ #
     # Start Training / Testing #
     # ------------------------ #
     set_train_idx = set(train_idx)
     set_test_idx = set(test_idx)
     features = sorted(features)
     mat, needles_in_haystack = self.get_mat_needles_in_haystack(
         features, [
             i
             for i in xrange(self.smat.shape[0]) if (i not in set_train_idx)
         ])
     labels = [(i in set_test_idx) for i in xrange(self.smat.shape[0])
               if (i not in set_train_idx)]
     if self.expcfg.lsvc_loss == 'logloss':
         classifier = LogisticRegression(C=self.expcfg.lsvc_C,
                                         intercept_scaling=1000,
                                         penalty=self.expcfg.lsvc_penalty,
                                         dual=False,
                                         tol=1e-4,
                                         fit_intercept=True,
                                         verbose=0,
                                         random_state=args.seed,
                                         max_iter=1000)
     else:
         classifier = LinearSVC(C=self.expcfg.lsvc_C,
                                intercept_scaling=1000,
                                penalty=self.expcfg.lsvc_penalty,
                                loss=self.expcfg.lsvc_loss,
                                dual=False,
                                tol=1e-4,
                                fit_intercept=True,
                                verbose=0,
                                random_state=args.seed,
                                max_iter=1000)
     with rasengan.tictoc('Fitting', timer='total_time'):
         classifier.fit(needles_in_haystack, labels)
     scores = classifier.decision_function(mat)
     # classifier.sparsify()
     self.pa(cat,
             scores,
             train_idx,
             test_idx,
             scratch=dict(coef=classifier.coef_,
                          intercept=classifier.intercept_,
                          features=features))
     return mat, classifier
Example n. 14
 def create_AT(self, arr_gen, intmdt_fn=None):
     # TODO: Make a shortcut if the array to be generated already exists
     try:
         I = arr_gen.I
     except AttributeError:
         I = arr_gen[0].shape[0]
     AT_arr_shape = (I, self.intermediate_dim * len(arr_gen))
     if intmdt_fn is None:
         print "Allocating array of size", AT_arr_shape
         AT_arr = numpy.empty(AT_arr_shape, dtype='float32', order='C')
     else:
         AT_arr = numpy.memmap(intmdt_fn,
                               dtype='float32',
                               mode='w+',
                               shape=AT_arr_shape,
                               order='C')
     transform_f = VT.parse(self.view_transform)
     for arr_idx, _arr in enumerate(arr_gen):
         # arr = numpy.asfortranarray(transform_f(_arr))
         arr_ = transform_f(_arr)
         arr = arr_.tocsc()
         if arr is not _arr:
             del _arr
         if arr is not arr_:
             del arr_
         print >> sys.stderr, arr_idx, arr.shape, arr.max(), arr.min()
         print_config(msg='Started SVDS')
         with tictoc('Timing SVD', override='stderr'):
             [A, S, B] = sparse_svd(arr,
                                    self.intermediate_dim,
                                    method=self.svd_method)
         print_config(msg='Finished SVDS')
         if self.mean_center:
             if B is not None:
                 [A, S, B] = lib_linalg.mean_center(A, S, B, arr)
                 B = None  # drop the reference early; the name stays bound for the shared del below
             else:
                 [A, S] = lib_linalg.mean_center(A, S, arr)
         A *= self.create_T(S)
         begin = self.intermediate_dim * arr_idx
         end = self.intermediate_dim * (arr_idx + 1)
         AT_arr[:, begin:end] = A
         del A, S, B
         print_config(msg='Finished processing Array: ' + str(arr_idx))
         if intmdt_fn is not None:
             AT_arr.flush()
     return AT_arr
Example n. 15
def update_shelf():
    url_mention = DbfilenameShelf(args.in_shelf, protocol=-1)
    TM = url_mention['__TOKEN_MAPPER__']
    TM.finalize(catpeople_baseline_nb_config.MAX_TOK)
    E = url_mention['__URL_LIST__']
    n_doc = 10000
    with rasengan.tictoc('Extracting Contexts'):
        df_obj = TextualClueObject(E[:n_doc], url_mention, TM)
    df = defaultdict(int)
    for features in df_obj.features.itervalues():
        for f in features:
            df[f] += 1
    for f in df.keys():
        df[f] = df[f] / float(n_doc)
    url_mention['__DF__'] = dict(df)
    url_mention.close()
    return
def make(args, force=False, pipeline=False):
    ''' Wrap one training stage. Before yielding, check whether a trained
    pickle file `saveto` for this stage already exists; if it does (and
    `force` is not set), skip training by clearing `args.perform_training`.
    After the stage finishes, restore `perform_training` to its original
    value and, if `pipeline` is set, point `args.pretrained_param_pklfile`
    at the pkl file that was just saved (or that already existed) and reset
    `args.saveto`.

    Params
    ------
    args     : namespace carrying `folder`, `pkl_name` and `perform_training`
    force    : train even if the saved pickle already exists (default False)
    pipeline : chain stages by propagating `pretrained_param_pklfile` (default False)
    '''
    saveto = os.path.join(args.folder, args.pkl_name)
    with rasengan.tictoc('Making ' + saveto):
        rasengan.ensure_dir(args.folder, verbose=1, treat_as_dir=1)
        if hasattr(args, 'saveto'):
            assert args.saveto == saveto, str((args.saveto, saveto))
        else:
            args.saveto = saveto
            print 'Set args.saveto=', args.saveto
        # Check whether we need to do any training unless forced
        # explicitly.
        pt = args.perform_training
        if not force and os.path.exists(args.saveto):
            args.perform_training = 0
        rasengan.increase_print_indent()
        #----#
        yield
        #----#
        rasengan.decrease_print_indent()
        args.perform_training = pt
        if pipeline:
            # Set the pretrained_param_pklfile field to a value after saving
            # parameters to that location.
            args.pretrained_param_pklfile = args.saveto
            # Reset args.saveto to null
            args.saveto = None
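
# A minimal usage sketch (added, not from the source). Since `make` yields
# once, it is presumably used as a context manager, e.g. via
# contextlib.contextmanager; the wrapping and the stage name below are
# hypothetical, and `args`/`data` are assumed available as in the snippets above.
from contextlib import contextmanager

make_stage = contextmanager(make)

with make_stage(args, pipeline=True):
    perform_training_and_testing('stage-1', args, data)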
Example n. 18
def entity_list_to_ngram_csr_mat(cfg,
                                 catpeople,
                                 width=None,
                                 n=0,
                                 add_governor_arc_label=False):
    assert n in [0, 1]
    url_list = catpeople['__URL_LIST__']
    shape = (len(url_list), len(TM) if width is None else width)
    PARSES = None
    if add_governor_arc_label:
        assert n == 0
        with rasengan.tictoc('Loading Parses'):  # 1 min
            PARSES = pkl.load(util_catpeople.proj_open(cfg.parsefn))
        iterator = (get_ngrams_from_catpeople_entity(n,
                                                     catpeople[url],
                                                     cfg,
                                                     PARSES,
                                                     yield_nsuf=True)
                    for url_idx, url in enumerate(url_list))
    else:
        iterator = (get_ngrams_from_catpeople_entity(n, catpeople[url], cfg,
                                                     None)
                    for url_idx, url in enumerate(url_list))
    return csr_mat_builder(iterator, shape=shape, verbose=0)
Example n. 19
def main():
    import argparse
    arg_parser = argparse.ArgumentParser(description='')
    arg_parser.add_argument('--seed', default=0, type=int, help='Default={0}')
    arg_parser.add_argument('--emb_pkl_fn',
                            default='data/demonstrate_similarity_idea.emb.pkl',
                            type=str)
    arg_parser.add_argument(
        '--feat_file',
        default='data/random/details/89c0c894.American_women_writers',
        type=str)
    arg_parser.add_argument('--ctag', default=None, type=int)
    arg_parser.add_argument('--mode_count', default=5, type=int)
    arg_parser.add_argument('--method',
                            default='fast_relax',
                            type=str,
                            choices=[
                                'brute_force', 'fast_relax', 'annealed_gibbs',
                                'maxproduct-bp', 'variational_inference',
                                'dc_programming'
                            ])
    args = arg_parser.parse_args()
    import random
    random.seed(args.seed)
    numpy.random.seed(args.seed)
    cfg.mode_count = args.mode_count
    tags_to_remove = defaultdict(list)
    with rasengan.tictoc('Loading pkl'):
        embeddings = pkl.load(open(args.emb_pkl_fn))
        if cfg.introduce_NULL_embedding:
            embeddings[cfg.NULL_KEY] = numpy.zeros(
                next(embeddings.itervalues()).shape)
    with rasengan.debug_support():
        for mode_idx in range(cfg.mode_count):
            print 'mode_idx=', mode_idx
            entity_tags = {}
            entities = []
            for row in open(args.feat_file):
                _e, _tags = [e.strip() for e in row.strip().split('|||')]
                entities.append(_e)
                entity_tags[_e] = set([
                    t.lower()
                    for t in (e.strip().split(':')[0] for e in _tags.split())
                    if t.lower() in embeddings
                ])
            total_tags = set(
                rasengan.flatten([list(e) for e in entity_tags.values()]))
            assert all(e in embeddings for e in total_tags)
            print(
                'For each of these people our goal is to select one word.'
                ' That word should be as similar to other words picked for other'
                ' entities as possible')

            problem = rasengan.OrderedDict_Indexable_By_StringKey_Or_Index()
            for (a, b) in entity_tags.items():
                b = list(b)
                print 'Entity: ', a, 'tags to remove: ', tags_to_remove[a]
                for ttr in tags_to_remove[a]:
                    tolerant_remove(b, ttr)
                if cfg.introduce_NULL_embedding and cfg.NULL_KEY not in b:
                    b.append(cfg.NULL_KEY)
                # print '%-25s' % a, '|||', ', '.join(b)
                problem[a] = DataFrame(
                    data=numpy.concatenate(
                        [(scale_to_unit(embeddings[e])
                          if cfg.scale_to_unit else embeddings[e])[None, :]
                         for e in b],
                        axis=0),
                    index=b)
            if args.ctag is None:
                initial_assignment = dict(
                    (__a, 0) for __b, __a in enumerate(entities))
            else:
                ctag = 'war'.split()[args.ctag]
                initial_assignment = dict(
                    (__e,
                     (cfg.NULL_KEY if ctag not in entity_tags[__e] else ctag))
                    for __e in entities)
            print 'Initial chosen tags::', chosen_tags(problem,
                                                       initial_assignment)
            initial_objective = dp_objective_efficient_impl(
                problem, initial_assignment)
            print 'initial_objective=', initial_objective
            assert numpy.isclose(
                dp_objective_naive_impl(problem, initial_assignment),
                initial_objective)
            final_assignment = optimize_assignment(problem,
                                                   initial_assignment,
                                                   method=args.method)
            final_objective = dp_objective_efficient_impl(
                problem, final_assignment)
            for (fa_entity, fa_tag_idx) in final_assignment.iteritems():
                tags_to_remove[fa_entity].append(
                    liloc(problem[fa_entity], fa_tag_idx).name)
            print 'mode_idx=', mode_idx,
            print 'initial_objective=', initial_objective,
            print 'final_objective=', final_objective,
            print 'Final chosen tags=', chosen_tags(problem, final_assignment)
    return
                print preamble, 'AUPR=%.3f' % rasengan.rank_metrics.average_precision(
                    testing_output),
                testing_output = [(1 if e in set_I else 0)
                                  for e in range(total_persons)
                                  if e not in set_train_idx]
                sto = sum([e for e in testing_output])
                e0_to = [e for e in testing_output]
                print 'CORRECTAUPR=%.3f' % rasengan.rank_metrics.average_precision(
                    e0_to), \
                    'CORRECTP@10=%.3f' % (rasengan.rank_metrics.precision_at_k(
                        e0_to, 10) if sto > 10 else -1), \
                    'CORRECTP@100=%.3f' % (rasengan.rank_metrics.precision_at_k(
                        e0_to, 100) if sto > 100 else -1)
                continue

            with rasengan.tictoc('Writing graph file'):
                with open('graph_file', 'wb') as f:
                    for row, col, val in zip(*scipy.sparse.find(s_features)):
                        if col != predicate_idx and (dnd != 'wo_doc' or col
                                                     not in docfeat_idx):
                            f.write('%d\t%d\t%.4f\n' %
                                    (row, total_persons + col, val))

            with rasengan.tictoc('Writing Seed File'):
                with open('seed_file', 'wb') as f:
                    for node in train_idx:
                        if node in set_I:
                            f.write('%d\tL1\t1\n' % node)
                        else:
                            f.write('%d\tL0\t1\n' % node)
Example n. 21
from rasengan import rank_metrics
import os
import ipdb as pdb
import igraph
import argparse
import itertools

arg_parser = argparse.ArgumentParser(description='')
arg_parser.add_argument('--seed', default=0, type=int)
arg_parser.add_argument('--rw_walk_num', default=10, type=int)
arg_parser.add_argument('--rw_max_step', default=3, type=int)
args = arg_parser.parse_args()
random.seed(args.seed)
np.random.seed(args.seed)

with rasengan.tictoc('Setup'):
    IDX_PKL_FN = r'../../scratch/relational_bbn2_train_test_idx.pkl'
    fn = os.path.expanduser(
        '~/data/tackbp2015bbn2/basicfeaturization_relational_bbn2.pkl')
    data = pkl.load(open(fn))
    vertex_dict = data['vertex_dict']
    edgelist = data['edgelist']
    TOTAL_FEATURES = data['TOTAL_FEATURES']
    F2I_MAP = data['PERFECT_HASH']
    I2F_MAP = dict((a, b) for (b, a) in F2I_MAP.iteritems())
    guid_list = vertex_dict.keys()
    vertices = [vertex_dict[e] for e in guid_list]
    features = [v.features for v in vertices]
    row_names = [v.name for v in vertices]

    def index_row_names(idi):
Example n. 22
 def process_AT(self, AT_arr, debug=False):
     print_config(msg='Started svd_1')
     with tictoc('Performing Final SVD', override='stderr'):
         [G, i] = lib_linalg.svd_1(AT_arr, debug=debug)
     print_config(msg='Finished svd_1')
     return G
Example n. 23
| Last-Updated: Thu Sep  1 13:42:50 2016 (-0400)
|           By: Pushpendre Rastogi
|     Update #: 10
'''
import cPickle as pkl
from wikilink_category_to_url_and_count_reverse_index import WikilinkReverseIndex
import argparse
import rasengan

arg_parser = argparse.ArgumentParser(description='')
arg_parser.add_argument(
    '--caturl_pkl',
    default='data/wikilink_category_to_url_and_count_reverse_index.pkl',
    type=str)
arg_parser.add_argument('--chosen_cat',
                        default='data/chosen_wikilink_categories',
                        type=str)
arg_parser.add_argument('--mention_thresh', default=10, type=int)
args = arg_parser.parse_args()
chosen_cat = set(
    [e.strip().split()[0] for e in open(args.chosen_cat) if e != '\n'])
with rasengan.tictoc('loading pkl', override='stderr'):
    caturl = pkl.load(open(args.caturl_pkl))
for row in open(args.chosen_cat):
    if row == '\n':
        continue
    cat = row.strip().split()[0]
    for url, cnt in caturl[cat]:
        if cnt >= args.mention_thresh:
            print cat, url
Example n. 24
                        default=None,
                        type=str,
                        help='Default={None}')
arg_parser.add_argument('--cache_fn',
                        default=None,
                        type=str,
                        help='Default={None}')
arg_parser.add_argument('--leaf_fn',
                        default=None,
                        type=str,
                        help='Default={None}')
args = arg_parser.parse_args()
# ------------------- #
# Initialize Globals. #
# ------------------- #
with rasengan.tictoc('Initializing Globals'):
    CFG = rasengan.deep_namespacer(
        yaml.load(open('relationalize_base_graph.yaml').read()))
    FOREIGN_NS, BASE_NS = pickle.load(open(args.cache_fn, 'rb'))
ORG_TYPES = [
    'adept-core#OrgHeadquarter', 'adept-core#Organization',
    'adept-core#OrganizationWebsite', 'adept-core#StartOrganization',
    'adept-core#EndOrganization', 'adept-core#Membership',
    'adept-core#Subsidiary'
]
NONRELATIONAL_TYPES = [
    'adept-base#Date', 'adept-core#Crime', 'adept-core#GeoPoliticalEntity',
    'adept-core#Person', 'adept-core#Title', 'adept-core#URL'
]

# ---------------- #
def train_transducer_lbfgs(
        train_lex, train_y, args, ttns, training_stats, batch_size=None):
    ''' This function completes a training epoch by doing one run of LBFGS.
    `ts` abbreviates `train_stack` throughout this function.

    Params
    ------
    train_lex      : A list of input_strings (the strings are represented as np arrays)
    train_y        : A list of output strings
    batch_size     : UNUSED : (default None)
    '''
    assert args.clipping_value < 0
    assert args.projection_threshold < 0

    ts_param_name = [
        str(e) for e in ttns.train_stack_config.updatable_parameters()]
    print 'The following params will be trained by lbfgs', ts_param_name
    ts_param_shape_list = [ttns.train_stack_config[name].get_value().shape
                           for name in ts_param_name]
    ts_param_shape_map = dict(zip(ts_param_name, ts_param_shape_list))

    total_param = sum(numpy.prod(shape)
                      for shape
                      in ts_param_shape_map.values())

    def set_entries_in_ttns(param_vec):
        ''' Set entries in ttns.train_stack_config
        with corresponding values in param_vec.
        '''
        param_vec = param_vec.astype('float32')
        offset = 0
        for name in ts_param_name:
            shape = ts_param_shape_map[name]
            numel = numpy.prod(shape)
            ttns.train_stack_config[name].set_value(
                param_vec[offset:offset + numel].reshape(shape))
            offset += numel
            pass
        return

    def vectorize(param_list, dtype='float32'):
        param_vec = numpy.zeros((total_param,), dtype=dtype)
        offset = 0
        for idx, param in enumerate(param_list):
            shape = param.shape
            assert shape == ts_param_shape_list[idx]
            numel = numpy.prod(shape)
            param_vec[offset:offset + numel] = param.reshape((numel,)).astype(dtype)
            offset += numel
            pass
        return param_vec

    def get_entries_in_ttns():
        ''' Return the current values in ttns.train_stack_config
        flattened into a single parameter vector.
        '''
        return vectorize(
            [ttns.train_stack_config[name].get_value()
             for name
             in ts_param_name])

    def loss_over_corpus(param_vec):
        ''' Compute the loss value over the entire corpus.
        '''
        set_entries_in_ttns(param_vec)
        corpus_cost = 0
        for idx in range(len(train_lex)):
            input_string = train_lex[idx]
            output_string = train_y[idx]
            corpus_cost += ttns.train_f_cost(input_string, output_string)
        return corpus_cost / len(train_lex)

    def gradient_over_corpus(param_vec):
        set_entries_in_ttns(param_vec)
        corpus_grad = numpy.zeros((total_param,), dtype='float64')
        for idx in range(len(train_lex)):
            input_string = train_lex[idx]
            output_string = train_y[idx]
            tmp_grad = ttns.train_f_grad(input_string, output_string)
            corpus_grad += vectorize(tmp_grad, 'float64')
        return corpus_grad / len(train_lex)

    with rasengan.tictoc("Training %d epoch"%training_stats['epoch_id']):
        init_param = get_entries_in_ttns()
        rasengan.warn('Skipped FD Check')
        # print 'Check grad output: Error=', scipy.optimize.check_grad(func=loss_over_corpus, grad=gradient_over_corpus, x0=init_param)
        opt_param = scipy.optimize.fmin_l_bfgs_b(
            loss_over_corpus, init_param,
            fprime=gradient_over_corpus, disp=2, maxiter=1000)[0]
        set_entries_in_ttns(opt_param)
    return
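
# Toy sketch (added for illustration) of the flatten-then-optimize pattern used
# above: scipy.optimize.fmin_l_bfgs_b takes a loss and a gradient over a single
# flat parameter vector and returns the optimal vector as its first element.
import numpy
import scipy.optimize

target = numpy.array([1.0, -2.0, 3.0])

def toy_loss(param_vec):
    return 0.5 * numpy.sum((param_vec - target) ** 2)

def toy_grad(param_vec):
    return param_vec - target

opt_param = scipy.optimize.fmin_l_bfgs_b(
    toy_loss, numpy.zeros(3), fprime=toy_grad, disp=0, maxiter=100)[0]
assert numpy.allclose(opt_param, target)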
Example n. 26
#!/usr/bin/env python
'''
| Filename    : wikimic_create_entity_token_set.ipkl.py
| Description : Convert a giant pickle file that can't be processed
|               conveniently with other data into a streaming pkl.
| Author      : Pushpendre Rastogi
| Created     : Fri Aug  5 11:37:23 2016 (-0400)
| Last-Updated: Fri Aug  5 11:47:29 2016 (-0400)
|           By: Pushpendre Rastogi
|     Update #: 1
'''
import argparse
from rasengan import sPickle, tictoc
arg_parser = argparse.ArgumentParser(description='')
arg_parser.add_argument('--in_fn', type=str)
arg_parser.add_argument('--out_fn', type=str)
args = arg_parser.parse_args()
with tictoc('Loading Data'):
    import cPickle as pkl
    data = pkl.load(open(args.in_fn))
with tictoc('Writing Data'):
    with open(args.out_fn, 'wb') as f:
        sPickle.s_dump(data.iteritems(), f)
Example n. 27
|     Update #: 24
'''
import cPickle as pkl
import rasengan
import yaml
# yaml_data = yaml.load(open('data/women_writer_manual_clues.yaml'))
# tags = set(
# rasengan.flatten([(rasengan.flatten(b[1::2])) for b in
# yaml_data.values()]))

tags = {}

for row in open('data/entity_descriptors_procoref~1.psv'):
    entity, _tags = [e.strip() for e in row.strip().split('|||')]
    for t in (e.strip().split(':')[0] for e in _tags.split()):
        tags[t] = None
        tags[t.lower()] = None

print len(tags)
with rasengan.tictoc('Loading MVLSA emb'):
    data = pkl.load(open(
        '/Users/pushpendrerastogi/data/embedding/mvlsa/combined_embedding_0.emb.pkl'))
tag_emb = {}
for tag in tags:
    try:
        tag_emb[tag] = data[tag]
    except KeyError:
        print tag
with open('data/demonstrate_similarity_idea.emb.pkl', 'wb') as f:
    pkl.dump(tag_emb, f, protocol=-1)
Example n. 28
                     len(fold[1])] + self.get_fold_stats(fold))

    def fold_stats(self, add_cat=False):
        if add_cat:
            return [[cat, fold_idx] + self.get_fold_stats(fold)
                    for cat in self.record
                    for fold_idx, fold in enumerate(self.record[cat])]
        else:
            return [
                self.get_fold_stats(fold) for cat in self.record
                for fold in self.record[cat]
            ]

    def __str__(self):
        fold_stats = self.fold_stats()
        return ('(AUPR %.3f %.3f) (P@10 %.3f %.3f) (P@100 %.3f %.3f) '
                '(MRR %.3f %.3f)') % tuple(
                    numpy.array(fold_stats).mean(axis=0).tolist())


if __name__ == '__main__':
    import cPickle as pkl
    for f in [
            '/export/b15/prastog3/catpeople_experiment.ppcfg~8.expcfg~303.pkl'
    ]:
        print '--------- FILE: ', f
        with tictoc('Loading Pkl'):
            data = pkl.load(open(f))
        # data.limit=1000
        print data
Example n. 29
|     Update #: 28
'''
import cPickle as pkl
import os.path
import bz2
import rasengan
from collections import defaultdict
opj = os.path.join
dbpdir = os.path.expanduser('~/Downloads/dbpedia')
from util_wikiurl import simplify_wiki_url
DBPEDIA_PREF_LEN = len('<http://dbpedia.org/resource/')
CAT_PREF_LEN = len('Category:')

cat_index = {}
row_idx = 0
with rasengan.tictoc('LOADING CATEGORY INDEX FROM DBPEDIA'):  # 100s
    for row in bz2.BZ2File(opj(dbpdir, 'article_categories_en.ttl.bz2')):
        # Discard rows that start with '#' since they are comments.
        if row.startswith('#'):
            continue
        row = row.strip().split()
        cat = row[2][DBPEDIA_PREF_LEN + CAT_PREF_LEN:-1]
        if cat not in cat_index:
            cat_index[cat] = row_idx
            row_idx += 1

with open('data/dbpedia_cat_index.pkl', 'wb') as f:
    pkl.dump(cat_index, f)

with rasengan.tictoc('LOADING ARTICLE to CATEGORY MAP FROM DBPEDIA'):  # 121s
    art_cat = defaultdict(list)
def main():
    with rasengan.tictoc('Yaml Loading'):
        cfg = rasengan.deep_namespacer(
            yaml.load(open('relationalize_base_graph.yaml')))
    feat_strings = []
    for k in (_ for _ in cfg.features
              if _ not in ['adept-core#ChargeIndict', 'adept-core#BeBorn']):
        v = [
            e for e in cfg.features[k].keys()
            if e not in ['person', 'document', 'confidence']
        ]
        for e in v:
            feat_strings.append(k + '~' + e + '~name~')
    fn = ('/Users/pushpendrerastogi/data/'
          'tackbp2015bbn2/basicfeaturization_relational_bbn2.pkl')
    data = pkl.load(open(fn))
    vertex_dict = data['vertex_dict']
    edgelist = data['edgelist']
    TOTAL_FEATURES = data['TOTAL_FEATURES']
    F2I_MAP = data['PERFECT_HASH']
    I2F_MAP = dict((a, b) for (b, a) in F2I_MAP.iteritems())
    TOTAL_PERSONS = len(vertex_dict)
    with rasengan.tictoc('docs creation'):
        docs = set([
            fs['~document'] for v in vertex_dict.values() for fs in v.featsets
        ])

    with rasengan.tictoc('s_features creation'):
        guid_list = vertex_dict.keys()
        vertices = [vertex_dict[e] for e in guid_list]
        features = [v.features for v in vertices]
        row_names = [v.name for v in vertices]
        data = []
        row = []
        col = []
        for r, f in enumerate(features):
            for c, d in f:
                data.append(d)
                col.append(c)
                row.append(r)

        s_features = csc_matrix((data, (row, col)),
                                shape=[len(vertex_dict), TOTAL_FEATURES])

    feature_occurrence = lambda rpfx: [
        (e, v, s_features[:, v].getnnz(), s_features[:, v].sum())
        for e, v in F2I_MAP.items() if e.startswith(rpfx)]

    high_occurrence_feat = lambda rpfx: sorted(
        [e for e in feature_occurrence(rpfx) if e[2] > 1],
        key=lambda x: x[2],
        reverse=True)
    # adept-core#Origin~origin~name~"Israel"
    # adept-core#Resident~location~name~"United States"
    # adept-core#Die~pod~name~"Iraq"
    # adept-core#Die~pod~name~"United States"
    # adept-core#Die~pod~name~"Pakistan"
    # adept-core#Leadership~subject_org~name~"Democrats"
    # adept-core#StudentAlum~almamater~name~"Harvard University"
    # adept-core#InvestorShareholder~invested_org~name~"Chrysler"
    # adept-core#InvestorShareholder~invested_org~name~"Boston Globe"
    # adept-core#InvestorShareholder~invested_org~name~"New York Post"
    # adept-core#EmploymentMembership~employer~name~"United States"
    # adept-core#Role~role~name~"manager"
    # adept-core#Founder~founded_org~name~"Church"
    # adept-core#Founder~founded_org~name~"Solamere Capital"
    # adept-core#Founder~founded_org~name~"Tesla Motors"
    # We used the high_occurrence_feat function to find out the right
    # features to use.
    if LOAD_DATA:
        data_used = pkl.load(open(IDX_PKL_FN, 'rb'))
    else:
        data_used = defaultdict(dict)
    for feat in [
            'adept-core#EmploymentMembership~employer~name~"Army"',
            'adept-core#EmploymentMembership~employer~name~"White House"',
            'adept-core#Leadership~subject_org~name~"Democratic"',
            'adept-core#Leadership~subject_org~name~"Parliament"',
            'adept-core#Origin~origin~name~"American"',
            'adept-core#Origin~origin~name~"Russia"',
            'adept-core#Resident~location~name~"Chinese"',
            'adept-core#Resident~location~name~"Texas"',
            'adept-core#Role~role~name~"author"',
            'adept-core#Role~role~name~"director"',
            'adept-core#StudentAlum~almamater~name~"Harvard"',
            'adept-core#StudentAlum~almamater~name~"Stanford"'
    ]:
        feat_idx = F2I_MAP[feat]
        I = list(s_features[:, feat_idx].nonzero()[0])
        sI = set(I)
        Ic = list([_ for _ in range(s_features.shape[0]) if _ not in sI])
        random.shuffle(I)
        random.shuffle(Ic)

        # preamble = '\npfx=%s feature_rank=%d feat=%s' % (
        #     pfx, feature_rank, feat)
        for trials in range(5):
            preamble = 'feat=%s' % feat
            try:
                if len(I) < 10:
                    raise DatasetTooSmall
                ds = Dataset(row_names,
                             s_features,
                             I,
                             Ic,
                             I2F_MAP,
                             perma_mask=[feat_idx],
                             test_size_by2=min(25,
                                               len(I) / 2))
                if not LOAD_DATA:
                    data_used[feat][trials] = dict(
                        train=ds.get_train_set_idx(5),
                        test=ds.get_test_set_idx())
                assert feat_idx not in ds.col_idx_to_keep()
            except DatasetTooSmall:
                print preamble, '\nTest Set too big'
                continue
            if not LOAD_DATA:
                continue
            for train_size_by2 in [5]:  # (2, 5, 10, 20):
                for mask_pattern in (re.compile('~document~.*'),
                                     re.compile('XXXX')):
                    print preamble, 'train_size_by2=%d' % train_size_by2, \
                        'mask_pattern.pattern=%s' % mask_pattern.pattern
                    train_idx = (data_used[feat][trials]['train']
                                 if LOAD_DATA else None)
                    if USE_SMALL_TEST_SET:
                        test_idx = (data_used[feat][trials]['test']
                                    if LOAD_DATA else None)
                    else:
                        set_train_idx = set(train_idx)
                        test_idx = ([
                            i for i in range(TOTAL_PERSONS)
                            if i not in set_train_idx
                        ] if LOAD_DATA else None)
                    binary_linear_classifier_diagnostics(
                        ds,
                        train_size_by2=train_size_by2,
                        mask_pattern=mask_pattern,
                        train_idx=train_idx,
                        test_idx=test_idx)
    # The feature runs are over.
    if not LOAD_DATA:
        with open(IDX_PKL_FN, 'wb') as f:
            pkl.dump(dict(data_used), f, protocol=-1)
Example n. 31
    return (float(x) * y) / (x + y) / 2


# --------------------------- #
# BEGIN SCRIPT FUNCTIONALITY  #
# --------------------------- #
import argparse
arg_parser = argparse.ArgumentParser(description='')
arg_parser.add_argument('--cnt_transform',
                        default='GM_SQRT_FREQ_SQRT_COUNT',
                        type=str)
arg_parser.add_argument('--print_entity_list', default=1, type=int)
arg_parser.add_argument('--intervene_modes', default=1, type=int)
args = arg_parser.parse_args()

with tictoc('Loading emb pkl'):
    dcr2emb = pkl.load(open('data/demonstrate_similarity_idea.emb.pkl'))
    for e in dcr2emb:
        dcr2emb[e] = scale_to_unit(dcr2emb[e])
    cat2mode = get_cat2mode()
    CONSTANT = (lambda x, t: 1)
    COUNT = (lambda x, t: x)
    LOG_COUNT = (lambda x, t: math.log(1 + x))
    SQRT_COUNT = (lambda x, t: math.sqrt(x))
    FREQ = (lambda x, t: float(x + 1) / (t + 1))
    SQ_FREQ = (lambda x, t: (float(x + 1) / (t + 1))**2)
    SQRT_FREQ = (lambda x, t: math.sqrt(float(x + 1) / (t + 1)))
    PROD_SQRT_FREQ_SQRT_COUNT = (
        lambda x, t: SQRT_COUNT(x, t) * SQRT_FREQ(x, t))
    GM_SQRT_FREQ_SQRT_COUNT = (
        lambda x, t: math.sqrt(SQRT_COUNT(x, t) * SQRT_FREQ(x, t)))
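    # Worked example of the transforms above (added for illustration): with
    # x = 4 occurrences out of t = 99 tokens,
    #   SQRT_COUNT(4, 99) = sqrt(4) = 2.0
    #   SQRT_FREQ(4, 99)  = sqrt((4 + 1) / (99 + 1.)) ~= 0.2236
    #   GM_SQRT_FREQ_SQRT_COUNT(4, 99) = sqrt(2.0 * 0.2236) ~= 0.6687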
Example n. 32
def main(args):
    with rasengan.debug_support():
        with rasengan.tictoc("Loading Data"):
            data_list = rasengan.namespacer(
                read_data(args.train_fn))
            val_data_list = rasengan.namespacer(
                read_data(args.dev_fn))
            if args.partition_dev_into_train > 0:
                lim = args.partition_dev_into_test
                data_list.extend(val_data_list[lim:])
                val_data_list = val_data_list[:lim]

            if args.partition_dev_into_test > 0:
                lim = args.partition_dev_into_test
                test_data_list = val_data_list[lim:]
                val_data_list = val_data_list[:lim]
            else:
                test_data_list = rasengan.namespacer(
                    read_data(args.test_fn))

            # data_list = val_data_list = [(u'jason', u'eisner')]
            lst_char = get_lst_char(data_list
                                    + val_data_list
                                    + test_data_list)
            data_list = add_bos(data_list)
            val_data_list = add_bos(val_data_list)
            test_data_list = add_bos(test_data_list)
            warnings.warn('''
            NOTE: While preparing sigma, we add 1 to the index
            returned by enumerate because the transducer unit that
            Ryan wrote uses index 0 as the index for the epsilon
            symbol. So essentially the epsilon symbol and the
            integer 0 are reserved symbols that cannot appear in the
            vocabulary.

            ALSO, we need to add 1 to the vocsize because of that.
            ''')
            # sigma :: char -> int
            sigma = dict((b, a+1) for (a,b) in enumerate(lst_char))

            # sigma_inv :: int -> char
            sigma_inv = dict((a+1, b) for (a,b) in enumerate(lst_char))

            if args.limit_corpus > 0:
                data_list = data_list[:args.limit_corpus]

            train_data = numerize(data_list, sigma, args.win)
            val_data = numerize(val_data_list, sigma, args.win)
            test_data = numerize(test_data_list, sigma, args.win)

            data = rasengan.Namespace()

            #-------------------------------------------------------------#
            # Add sets that would be used by the tensorflow seq2seq       #
            # model. See~$PY/tensorflow/models/rnn/translate/translate.py #
            #-------------------------------------------------------------#
            data.train_data = data_list
            data.val_data = val_data_list
            data.test_data = test_data_list

            data.train_set = train_data
            data.dev_set = val_data
            data.test_set = test_data

            data.vocsize = len(sigma) + 1
            data.idx2label = sigma_inv
            data.label2idx = sigma

            data.train_lex = [e[0] for e in train_data]
            data.train_y = [e[1] for e in train_data]

            data.valid_lex = [e[0] for e in val_data]
            data.valid_y = util_lstm_seqlabel.convert_id_to_word(
                [e[1] for e in val_data], data.idx2label)

            data.test_lex = [e[0] for e in test_data]
            data.test_y = util_lstm_seqlabel.convert_id_to_word(
                [e[1] for e in test_data], data.idx2label)

            data.words_train = []
            data.words_valid = []
            data.words_test = []
    return data
Example n. 33
def train_transducer_lbfgs(train_lex,
                           train_y,
                           args,
                           ttns,
                           training_stats,
                           batch_size=None):
    ''' This function completes a training epoch by doing one run of LBFGS.
    `ts` abbreviates `train_stack` throughout this function.

    Params
    ------
    train_lex      : A list of input_strings (the strings are represented as np arrays)
    train_y        : A list of output strings
    batch_size     : UNUSED : (default None)
    '''
    assert args.clipping_value < 0
    assert args.projection_threshold < 0

    ts_param_name = [
        str(e) for e in ttns.train_stack_config.updatable_parameters()
    ]
    print('The following params will be trained by lbfgs', ts_param_name)
    ts_param_shape_list = [
        ttns.train_stack_config[name].get_value().shape
        for name in ts_param_name
    ]
    ts_param_shape_map = dict(zip(ts_param_name, ts_param_shape_list))

    total_param = sum(
        numpy.prod(shape) for shape in ts_param_shape_map.values())

    def set_entries_in_ttns(param_vec):
        ''' Set entries in ttns.train_stack_config
        with corresponding values in param_vec.
        '''
        param_vec = param_vec.astype('float32')
        offset = 0
        for name in ts_param_name:
            shape = ts_param_shape_map[name]
            numel = numpy.prod(shape)
            ttns.train_stack_config[name].set_value(
                param_vec[offset:offset + numel].reshape(shape))
            offset += numel
            pass
        return

    def vectorize(param_list, dtype='float32'):
        param_vec = numpy.zeros((total_param, ), dtype=dtype)
        offset = 0
        for idx, param in enumerate(param_list):
            shape = param.shape
            assert shape == ts_param_shape_list[idx]
            numel = numpy.prod(shape)
            param_vec[offset:offset + numel] = param.reshape(
                (numel, )).astype(dtype)
            offset += numel
            pass
        return param_vec

    def get_entries_in_ttns():
        ''' Return the current values in ttns.train_stack_config
        flattened into a single parameter vector.
        '''
        return vectorize([
            ttns.train_stack_config[name].get_value() for name in ts_param_name
        ])

    def loss_over_corpus(param_vec):
        ''' Compute the loss value over the entire corpus.
        '''
        set_entries_in_ttns(param_vec)
        corpus_cost = 0
        for idx in range(len(train_lex)):
            input_string = train_lex[idx]

            output_string = train_y[idx]

            corpus_cost += ttns.train_f_cost(input_string, output_string)
        return corpus_cost / len(train_lex)

    def gradient_over_corpus(param_vec):
        set_entries_in_ttns(param_vec)
        corpus_grad = numpy.zeros((total_param, ), dtype='float64')
        for idx in range(len(train_lex)):
            input_string = train_lex[idx]
            output_string = train_y[idx]
            tmp_grad = ttns.train_f_grad(input_string, output_string)
            corpus_grad += vectorize(tmp_grad, 'float64')
        return corpus_grad / len(train_lex)

    with rasengan.tictoc("Training %d epoch" % training_stats['epoch_id']):
        init_param = get_entries_in_ttns()
        rasengan.warn('Skipped FD Check')
        # print 'Check grad output: Error=', scipy.optimize.check_grad(func=loss_over_corpus, grad=gradient_over_corpus, x0=init_param)
        opt_param = scipy.optimize.fmin_l_bfgs_b(loss_over_corpus,
                                                 init_param,
                                                 fprime=gradient_over_corpus,
                                                 disp=2,
                                                 maxiter=1000)[0]
        set_entries_in_ttns(opt_param)
    return
Example n. 34
   methods like ADAGrad can handle it.
8. I need to do mean normalization.
'''
if __name__ == '__main__':
    import argparse
    arg_parser = argparse.ArgumentParser(description='')
    arg_parser.add_argument('--seed', default=0, type=int)
    arg_parser.add_argument(
        '--config',
        type=str,
        default='Mvlsa@intermediate_dim~300@view_transform~SQROOT@mean_center~0'
    )
    arg_parser.add_argument('--I',
                            default=config.TREC_WEB_N_ENTITIES,
                            type=int)
    arg_parser.add_argument('--fn',
                            default=config.TREC_WEB_HIT_LIST_NPZ,
                            type=str)
    arg_parser.add_argument('--test', default=0, type=int)
    args = arg_parser.parse_args()
    import random
    random.seed(args.seed)
    numpy.random.seed(args.seed)
    out_fn = os.path.join(config.TREC_WEB_STORAGE, args.config)
    if args.test:
        args.fn = None
    G = embed(args.config, args.I, args.fn, save_intmdt_fn=out_fn + '.AT_arr')
    with tictoc('Pickling G'):
        with open(out_fn, 'wb') as f:
            numpy.save(f, G, allow_pickle=False)