def get_sparse_matrix(train=None, test=None, params=None, remove_numbers_function=True,
                      debug=True, save=False, load=True, data_dir="data"):
    """
    Get the sparse matrix form of the train and test sets.

    Parameters
    ----------
    train, test, params : numpy arrays
        See the documentation of the tf_idf function.
    save : bool
        Save the train and test sparse matrices in .npz format.
    load : bool
        Load the train and test sparse matrices from your local machine.
    data_dir : str
        The data directory where the matrices are saved.

    Returns
    -------
    train : train set in sparse matrix form
    test : test set in sparse matrix form

    Example
    -------
    >>> train = pd.read_csv("data/train.csv")
    >>> test = pd.read_csv("data/test.csv")
    >>> # to create and save the train and test set
    >>> train_sparse, test_sparse = get_sparse_matrix(train, test, params=None, remove_numbers_function=True, debug=True, save=True, load=False)
    >>> # to load the sparse matrices from your local machine
    >>> train, test = get_sparse_matrix(load=True)
    """
    base_dir = data_dir + '/output/'
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)
    name_train = base_dir + 'sparce_train.npz'
    name_test = base_dir + 'sparce_test.npz'

    if load:
        if os.path.exists(name_train) and os.path.exists(name_test):
            train, test = load_sparse_csr(name_train), load_sparse_csr(name_test)
        else:
            raise ValueError("You asked to load the features but they were not found "
                             "at the specified location: \n{}\n{}".format(name_train, name_test))
    else:
        print('Computing the sparse matrices, this will take a while...')
        train, test = tf_idf(train, test, params, remove_numbers_function, debug)
        if save:
            print('Saving train file as {}'.format(name_train))
            save_sparse_csr(name_train, train)
            print('Saving test file as {}'.format(name_test))
            save_sparse_csr(name_test, test)

    return train, test
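# Note: the save_sparse_csr / load_sparse_csr helpers used above are not defined in
# this snippet. A minimal sketch of what they might look like, assuming scipy.sparse
# CSR matrices persisted with numpy's .npz format (the project's actual helpers may
# differ):
import numpy as np
from scipy import sparse


def save_sparse_csr(filename, matrix):
    # Persist the CSR components (data, indices, indptr) together with the shape.
    np.savez(filename, data=matrix.data, indices=matrix.indices,
             indptr=matrix.indptr, shape=matrix.shape)


def load_sparse_csr(filename):
    # Rebuild the CSR matrix from its saved components.
    loader = np.load(filename)
    return sparse.csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                             shape=loader['shape'])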
def calc_weighted_modularity_langs(A_filename, indice2word_filename, langs):
    """
    :param A_filename: path to the sparse adjacency matrix saved in .npz format
    :param indice2word_filename: path to a pickled mapping from node index to word
    :param langs: prefix tags for each word in the .txt embedding file
    :return: normalized modularity
    """
    A = load_sparse_csr(A_filename).todok()
    indice2word = pickle.load(open(indice2word_filename, "rb"))
    num_community = len(langs)
    a_l = [0] * num_community
    e_ll = [0] * num_community
    e_ll_Q_max = [0] * num_community
    lang_to_indice = {lang: i for i, lang in enumerate(langs)}  # assigns a community ID to each language
    degree_list, two_m = get_degrees(A)

    # a_l computation
    for node_id in range(A.shape[0]):
        node_word = indice2word[node_id]
        community = lang_to_indice[node_word[:3]]
        k_i = degree_list[node_id]
        a_l[community] += k_i

    # e_ll computation
    for processed, e in enumerate(A.items()):
        node1, node2 = e[0]
        node1_word = indice2word[node1]
        node2_word = indice2word[node2]
        community_id = lang_to_indice[node1_word[:3]]
        e_ll_Q_max[community_id] += 1  # 1 is the maximum possible weight
        if node1_word[:3] == node2_word[:3]:
            e_ll[community_id] += A[node1, node2]

    check_e_ii_s_Q_max = 0
    for i in range(len(a_l)):
        check_e_ii_s_Q_max += e_ll_Q_max[i]
    assert check_e_ii_s_Q_max == two_m

    e_ll = list(map(lambda x: x / two_m, e_ll))
    e_ll_Q_max = list(map(lambda x: x / two_m, e_ll_Q_max))
    a_l = list(map(lambda x: x / two_m, a_l))
    print("mean degree=%.4f" % (np.mean(degree_list)))

    Q = 0
    Q_max = 0
    for i in range(len(a_l)):
        Q += e_ll[i] - (a_l[i] ** 2)
        Q_max += e_ll_Q_max[i] - (a_l[i] ** 2)
    print("Weighted Q=%.3f" % Q)
    print("Weighted Q_max=%.3f" % Q_max)
    print("Normalized Q=%.3f" % (Q / Q_max))
    return Q / Q_max
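# The get_degrees helper used above is assumed rather than shown. A minimal sketch of
# one possible implementation, assuming A is a scipy.sparse matrix holding a weighted
# adjacency matrix. Note that the assertion in the snippet above compares two_m to a
# count of stored entries, so the original helper may define two_m differently (e.g.
# for a graph with unit edge weights):
import numpy as np


def get_degrees(A):
    # Weighted degree of each node = row sum of the adjacency matrix;
    # two_m = sum of all degrees, i.e. every edge weight counted from both endpoints.
    degree_list = np.asarray(A.sum(axis=1)).flatten()
    two_m = degree_list.sum()
    return degree_list, two_m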
def __init__(self, db_path=DB_PATH, mat_path=MAT_PATH):
    self.db_path = db_path
    self.mat_path = mat_path
    self.db = DocDB(db_path=self.db_path)
    self.mat, metadata = utils.load_sparse_csr(self.mat_path)
    # metadata keys: doc_freqs, hash_size, ngram, doc_dict
    for k, v in metadata.items():
        setattr(self, k, v)
def __init__(self, strict=True, num_workers=8, tokenize=False, topn=100):
    """
    Args:
        tfidf_path: path to saved model file
        strict: fail on empty queries or continue (and return empty result)
    """
    # Load from disk
    self.topn = topn
    self.tokenize = tokenize
    self.num_workers = num_workers
    tfidf_path = DATA_DIR + "/corpus-tfidf-ngram=1-hash=16777216-tokenizer=spacy.npz"
    logger.info('Loading %s' % tfidf_path)
    matrix, metadata = utils.load_sparse_csr(tfidf_path)
    self.doc_mat = matrix
    self.ngrams = metadata['ngram']
    self.hash_size = metadata['hash_size']
    self.tokenizer = tokenizers.get_class(metadata['tokenizer'])()
    self.doc_freqs = metadata['doc_freqs'].squeeze()
    self.doc_dict = metadata['doc_dict']
    self.num_docs = len(self.doc_dict[0])
    self.strict = strict
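# Several snippets here (including the two constructors above) expect
# utils.load_sparse_csr to return both the matrix and a metadata dict. A minimal
# sketch of such a metadata-aware variant, assuming the metadata object is pickled
# into the same .npz archive at save time (the real utils module may differ):
import numpy as np
import scipy.sparse as sp


def save_sparse_csr(filename, matrix, metadata=None):
    # Store the CSR components together with an arbitrary metadata object.
    data = {
        'data': matrix.data,
        'indices': matrix.indices,
        'indptr': matrix.indptr,
        'shape': matrix.shape,
        'metadata': metadata,
    }
    np.savez(filename, **data)


def load_sparse_csr(filename):
    # Rebuild the matrix and unpack the pickled metadata object, if present.
    loader = np.load(filename, allow_pickle=True)
    matrix = sp.csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                           shape=loader['shape'])
    metadata = loader['metadata'].item(0) if 'metadata' in loader else None
    return matrix, metadata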
def initCosSimQuery():
    global term2id
    global invertedIndex
    global W
    global docCount
    global termCount
    global idf

    term2tidFile = open("term2tid.json", "r")
    indexFile = open("invertedIndex.json", "r")
    term2id = json.load(term2tidFile)
    invertedIndex = json.load(indexFile)
    term2tidFile.close()
    indexFile.close()

    W = utils.load_sparse_csr("weightMatrix.npz")
    idf = np.load('idf.npy')
    docCount = np.size(W, 1)
    termCount = len(term2id)
def _load_arrays(self, fn):
    zero_fn = fn + '.npz'
    bin_fn = fn + '.bin.npz'
    self.zero_sparse = load_sparse_csr(zero_fn)
    self.binary_sparse = load_sparse_csr(bin_fn)
        print('Loading the paragraph-trained classifier trained on data processed by '
              'the biggest-gap thresholding mechanism')
        classifier = load_pickle(config.classifier_par_biggest_gap)
        threshold = 0.91
        y_true = process_y(data, threshold_biggest_gap)
    else:
        print('Loading the paragraph-trained classifier trained on data processed by '
              'the threshold_half_max function')
        classifier = load_pickle(config.classifier_par_half_max)
        threshold = 0.39
        y_true = process_y(data, threshold_half_max)

    print("Loading x")
    x = load_sparse_csr(data['x'])
else:
    threshold = 0.3
    vectorizer = load_pickle(config.vectorizer)
    binarizer = load_pickle(config.binarizer)

    print('Loading the classifier')
    classifier = load_pickle(config.classifier)

    corpus, topics = build_corpus_and_topics(config.data['test'])

    print('Transforming corpus by vectorizer')
    x = vectorizer.transform(corpus)
    print('Transforming article topics by binarizer')
    y_true = binarizer.transform(topics)
def load_active_during_test():
    items_data = pd.read_csv("data_modified/item_profile.csv",
                             delim_whitespace=True,
                             dtype={'id': int, 'active_during_test': str},
                             usecols=['id', 'active_during_test'])
    return set(items_data[items_data.active_during_test == '1']['id'].values)


active_items = load_active_during_test()

# load data from csv
interactions_map = load_interactions()
urm_user = load_sparse_csr('urm_user_based_full.npz')
urm_item = load_sparse_csr('urm_item_based_full.npz')
urm_funk = load_sparse_csr('urm_funk_full.npz')

urm_user = normalize(urm_user)
urm_item = normalize(urm_item)
urm_funk = normalize(urm_funk)

# 0.024365000000000008 0.12 0.24
alpha = 0.3
beta = 0.3
# for beta in np.arange(start=0.28, stop=10, step=0.01):
estimated_urm = urm_item * alpha + urm_user * beta + urm_funk * (1.0 - alpha - beta)

# write recommendations
parser = argparse.ArgumentParser()
parser.add_argument('ct_path', type=str, default=None,
                    help='Path to count matrices')
parser.add_argument('out_dir', type=str, default=None,
                    help='Directory for saving output files')
args = parser.parse_args()

ct_files = [f for f in utils.iter_files(args.ct_path)]

logger.info('Loading the zeroth count matrix...')
mat, metadata = utils.load_sparse_csr(ct_files[0])
DOC2IDX, doc_ids = metadata['doc_dict']

for i in range(1, len(ct_files)):
    logger.info('Loading %ith count matrix...' % i)
    nxt_mat, nxt_metadata = utils.load_sparse_csr(ct_files[i])
    if metadata['hash_size'] != nxt_metadata['hash_size']:
        raise RuntimeError('hash_size not equal in %ith file' % i)
    if metadata['ngram'] != nxt_metadata['ngram']:
        raise RuntimeError('ngram not equal in %ith file' % i)
    logger.info('Merging...')
    mat = sp.hstack([mat, nxt_mat])
                    default=None, help='Path to count matrix file')
parser.add_argument('out_dir', type=str, default=None,
                    help='Directory for saving output files')
parser.add_argument('--model', type=str, default='tfidf',
                    help='tfidf or pmi')
args = parser.parse_args()

logger.info('Loading count matrix...')
count_matrix, metadata = utils.load_sparse_csr(args.ct_path)

logger.info('Making %s vectors...' % args.model)
if args.model == 'tfidf':
    mat = get_tfidf_matrix(count_matrix)
elif args.model == 'pmi':
    mat = get_pmi_matrix(count_matrix)
else:
    raise RuntimeError('Model %s is invalid' % args.model)

basename = os.path.splitext(os.path.basename(args.ct_path))[0]
basename = ('%s-' % args.model) + basename

if not os.path.exists(args.out_dir):
    logger.info("Creating data directory")
import utils
import numpy as np
from scipy import sparse
from sklearn.externals import joblib

# root of this code: /Google Drive/CIKM_AnalytiCup_2017/Code/CIKM_AnalytiCup_2017/
filename = 'features/features_title_desc_attributes_1_3_word_1_5_char.npz'
X = utils.load_sparse_csr(filename)
XX = utils.load_sparse_csr(filename.split('.')[-2] + '_valid.npz')
loaded = np.load(filename.split('.')[-2] + '_dict.npz')
words = loaded['words'].tolist()
chars = loaded['chars'].tolist()

feature_names = [
    'title.getLengths',
    'title.getLengthsByTerm',
    'title.getSpecialCharStat',
    'title.hasSpecialChar',
    'title.getUpperCharStat',
    'title.getNounStat',
    'title.getVerbStat',
    'title.getAdjectiveStat',
    'title.hasNumber',
    'title.getNumberStat',
    'title.hasTamilChar',
    'title.hasChineseChar',
    'title.getNonEnglishCharStat',
    'title.getColorStat',
    'title.getBrandStat(brands)',
    'title.getSyllableStat',
    'title.getPolysyllabStat',