def generate_training_validating_rt(version, r_to_i, u_to_i, r_u_t_fn, split, is_test=False): """Function called to generate training.mtx, validating.mtx and recommendation_times.npy """ if is_test: data_processed_dir = join(PROCESSED_DATA_DIR, "test") else: data_processed_dir = PROCESSED_DATA_DIR u_r_times = mmread(r_u_t_fn).transpose().tolil() nu, nr = u_r_times.shape training_matrix = lil_matrix((nu,nr), dtype=np.int_) validating_matrix = lil_matrix((nu,nr), dtype=np.int_) recommendation_times = np.zeros(nu, dtype=np.int_) valid_repositories_table = version+"_repositories" cursor = getDB(is_test=is_test).cursor() for uidx in xrange(nu): cursor.execute("""SELECT vr.id FROM repositories as r, {} as vr WHERE vr.id = r.id AND r.owner_id = %s """.format(valid_repositories_table), (u_to_i.r(uidx),)) owned_rs = np.array([r_to_i[r[0]] for r in cursor]) interests = u_r_times.getrowview(uidx) interested_rs = np.unique(interests.tocoo().col) ext_rs = np.setdiff1d(interested_rs, owned_rs, assume_unique=True) times = interests[0,ext_rs].toarray()[0] sorted_indices = times.argsort() threshold = int(floor(split*len(ext_rs))) training = [ext_rs[i] for i in sorted_indices[:threshold]] threshold_time = times[sorted_indices[threshold]] training += [r for r in owned_rs if interests[0,r] < threshold_time] validating = [ext_rs[i] for i in sorted_indices[threshold:]] for t in training: training_matrix[uidx,t] = 1 for v in validating: validating_matrix[uidx,v] = 1 recommendation_times[uidx] = threshold_time comment=""" Training interests are before validating interests. The split is as follows: Training: all internals before first last 1/3 externals + first 2/3 externals Testing: last 1/3 externals""" version_dir = join(data_processed_dir, version) tfn = join(version_dir, TRAINING_FN) vfn = join(version_dir, VALIDATING_FN) rtfn = join(version_dir, RECOMMENDATION_TIMES_FN) mmwrite(tfn, training_matrix, comment=comment) mmwrite(vfn, validating_matrix, comment=comment) np.save(rtfn, recommendation_times) return (tfn, vfn, rtfn)
def gen_app_pop_count(dev_app, ga_train, ga_test, base_dir='/data'): start_time = time.time() print('generating popularity weighted app count per device') app_popularity = dev_app.groupby(['app_id'])['device_id'].agg( {'popularity': lambda x: x.nunique()}) app_pop_count = dev_app.groupby(['device_id'])['app_id'].agg( {'app_pop_count': lambda x: app_popularity.loc[x.unique(), 'popularity'].sum()}) app_count_train = ga_train['device_id'].map( app_pop_count['app_pop_count']).fillna(0) app_count_train = app_count_train / app_count_train.max() app_count_train = csr_matrix(app_count_train.values).transpose() app_count_test = ga_test['device_id'].map(app_pop_count['app_pop_count']).fillna(0) app_count_test = app_count_test / app_count_test.max() app_count_test = csr_matrix(app_count_test.values).transpose() print('train set shape: ', app_count_train.shape) io.mmwrite(base_dir + "train_apppopcount.mtx", app_count_train) print('test set shape: ', app_count_test.shape) io.mmwrite(base_dir + "test_apppopcount.mtx", app_count_test) print('Time generating app pop count: ', (time.time() - start_time) / 60)
def encode() : """ Generate extra features from pairs, triplets, and common quadruplets of the existing features and then save those features in a sparse matrix to disk. """ dftrain = load_dataframe('train') dftest = load_dataframe('test') lentrain = len(dftrain) all_data = np.vstack((dftrain.ix[:,1:-1], dftest.ix[:,1:-1])) np.array(dftrain.ACTION).dump('{}/train_truth.dat'.format(ddir)) dp = group_data(all_data, degree=2, remove_unique=True) dt = group_data(all_data, degree=3, remove_unique=True) dq = group_data(all_data, degree=4, remove_unique=True) dq = remove_rare(dq, 15) X = all_data[:lentrain] X_2 = dp[:lentrain] X_3 = dt[:lentrain] X_4 = dq[:lentrain] X_train_all = np.hstack((X, X_2, X_3, X_4)) mmwrite('{}/train_encoded'.format(ddir), X_train_all) X_test = all_data[lentrain:] X_test_2 = dp[lentrain:] X_test_3 = dt[lentrain:] X_test_4 = dq[lentrain:] X_test_all = np.hstack((X_test, X_test_2, X_test_3, X_test_4)) mmwrite('{}/test_encoded'.format(ddir), X_test_all)
def build_nontext_vector(fin, colname, colidx, normalize): """ Handles the specified column as a categorical variable. """ print "Building category vector for %s" % (colname) fout = str.replace(fin, ".csv", "." + colname + ".mtx") if os.path.isfile(fout): return ftmp = str.replace(fin, ".csv", ".tmp") reader = csv.reader(open(fin, 'rb')) tmpwriter = open(ftmp, 'wb') ln = 0 for row in reader: ln += 1 if ln <= 1: continue if ln % 1000 == 0: print "...(processed %d lines)" % (ln) colval = str.lower(row[colidx]) if normalize: colval = str.replace(colval, " ", "_") if len(colval.rstrip()) == 0: colval = "UNK" tmpwriter.write(colval + "\n") tmpwriter.close() tmpreader = open(ftmp, 'rb') vectorizer = sft.CountVectorizer(max_features=100) catmatrix = vectorizer.fit_transform(tmpreader) os.remove(ftmp) writer = open(fout, 'wb') sio.mmwrite(writer, catmatrix) writer.close()
def __init__(self, programEntities): nusers = len(programEntities.userIndex.keys()) self.numFriends = np.zeros((nusers)) self.userFriends = ss.dok_matrix((nusers, nusers)) fin = open("../Data/user_friends.csv", 'rb') fin.readline() # skip header ln = 0 for line in fin: # if ln % 100 == 0: # print "Loading line: ", ln cols = line.strip().split(",") user = cols[0] if programEntities.userIndex.has_key(user): friends = cols[1].split(" ") i = programEntities.userIndex[user] self.numFriends[i] = len(friends) for friend in friends: if programEntities.userIndex.has_key(friend): j = programEntities.userIndex[friend] # the objective of this score is to infer the degree to # and direction in which this friend will influence the # user's decision, so we sum the user/event score for # this user across all training events. eventsForUser = programEntities.userEventScores.getrow(j).todense() score = eventsForUser.sum() / np.shape(eventsForUser)[1] self.userFriends[i, j] += score self.userFriends[j, i] += score ln += 1 fin.close() # normalize the arrays sumNumFriends = self.numFriends.sum(axis=0) self.numFriends = self.numFriends / sumNumFriends sio.mmwrite("../Models/UF_numFriends", np.matrix(self.numFriends)) self.userFriends = normalize(self.userFriends, norm="l1", axis=0, copy=False) sio.mmwrite("../Models/UF_userFriends", self.userFriends)
def __init__(self, programEvents): nevents = len(programEvents.eventIndex.keys()) self.eventPopularity = ss.dok_matrix((nevents, 5)) self.eventAttendees = collections.defaultdict(list) f = open("/users/chaitanya/PyCharmProjects/EventRec/data/event_attendees.csv", 'rb') f.readline() # skip header for line in f: cols = line.strip().split(",") eventId = cols[0] if programEvents.eventIndex.has_key(eventId): i = programEvents.eventIndex[eventId] self.eventPopularity[i, 0] = len(cols[1].split(" ")) - len(cols[4].split(" ")) # number of yes-no self.eventPopularity[i, 1] = len(cols[3].split(" ")) # number of invited folks self.eventAttendees[i].append(cols[1].split(" ")) #list of yes folks self.eventAttendees[i].append(cols[2].split(" ")) #list of no folks self.eventAttendees[i].append(cols[3].split(" ")) #list of invited folks f.close() self.eventPopularity = normalize(self.eventPopularity, norm="l1",axis=0, copy=False) sio.mmwrite("/users/chaitanya/PyCharmProjects/EventRec/Models/EA_eventPopularity", self.eventPopularity) cPickle.dump(self.eventAttendees, open("/users/chaitanya/PyCharmProjects/EventRec/Models/PE_eventAttendees.pkl", 'wb'))
def genJaccard(feature_matrix):
    # jaccard_matrix_pre is a list of arrays containing the non-zero indices
    # of each article in the corpus
    jaccard_matrix_pre = []
    for i in feature_matrix[0:test_num]:
        indices = np.flatnonzero(i)
        jaccard_matrix_pre.append(indices)
    S = sparse.dok_matrix((test_num, test_num))
    t0 = time.time()
    numi = 0
    for i in jaccard_matrix_pre:
        jnum = 0
        # only compute the lower triangle: n choose 2 calculations instead of n^2
        for j in jaccard_matrix_pre[0:numi + 1]:
            divisor = float(len(set(i).union(set(j))))
            if divisor != 0:
                actual_jaccard = float(len(set(i).intersection(set(j)))) / divisor
                if actual_jaccard != 0 and actual_jaccard != 1:
                    S[numi, jnum] = actual_jaccard
            jnum = jnum + 1
        numi = numi + 1
    # size of feature_matrix_large: 1261 x 19043
    with open('pickled_minhash/actual_jaccard_matrix_small.mtx', 'wb') as f:
        io.mmwrite(f, S)
    print("TIME to generate jaccard_matrix: {}".format(time.time() - t0))
def store_matrix(matrix='', output_dir_path='', out_file_name='', output_format=''): """store_matrix.""" if not os.path.exists(output_dir_path): os.mkdir(output_dir_path) full_out_file_name = os.path.join(output_dir_path, out_file_name) if output_format == "MatrixMarket": if len(matrix.shape) == 1: raise Exception( "'MatrixMarket' format supports only 2D dimensional array\ and not vectors") else: io.mmwrite(full_out_file_name, matrix, precision=None) elif output_format == "numpy": np.save(full_out_file_name, matrix) elif output_format == "joblib": joblib.dump(matrix, full_out_file_name) elif output_format == "text": with open(full_out_file_name, "w") as f: if len(matrix.shape) == 1: for x in matrix: f.write("%s\n" % (x)) else: raise Exception( "'text' format supports only mono dimensional array\ and not matrices") logger.info("Written file: %s" % full_out_file_name)
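# Usage sketch (not from the original source): a minimal, hedged example of how
# store_matrix above might be called. The "results" directory and the file name
# "sample_matrix" are hypothetical.
import scipy.sparse as sp

example = sp.random(100, 50, density=0.05, format='csr')  # random sparse matrix
store_matrix(matrix=example, output_dir_path='results',
             out_file_name='sample_matrix', output_format='MatrixMarket')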
def get_content_similarity_scores(readmes, dataset_dir, profile="tfidf", similarity="cos"):
    """Return a CSR matrix of similarity_{r,r} for all r in `readmes`.

    `dataset_dir` the directory where the similarity scores are cached
    `profile` "bool" or "tfidf"
    `similarity` "cos" or "ijd" (inverse Jaccard distance)
    """
    if profile == "tfidf":
        sim_fn = join(dataset_dir, TF_IDF_FN)
        if exists(sim_fn):
            return mmread(sim_fn).tocsr()
    if profile == "bool":
        # readme_words = COUNTVECTORIZER readmes  (not implemented yet)
        pass
    else:
        # max_df=0.5: a word occurring in more than half of the readmes is ignored
        tfidf = TfidfVectorizer(input='file',
                                # sublinear_tf=True,
                                max_df=0.5,
                                stop_words='english',
                                decode_error="ignore")
        readme_words = tfidf.fit_transform(readmes)
    if similarity == "cos":
        similarity_scores = csr_matrix(cosine_similarity(readme_words))
    else:
        # similarity_scores = csr_matrix(ijd(readme_words))  (not implemented yet)
        pass
    # NOTE: sim_fn and similarity_scores are only defined on the
    # profile="tfidf" / similarity="cos" path; the other branches are stubs.
    mmwrite(sim_fn, similarity_scores,
            comment=profile + "_" + similarity + "_similarity_{r,r}")
    return similarity_scores
def SOP(alpha, teta, nbBasket, nbReco): data = load() ############################################################### # CREATE MODELS ############################################################### print 'Create the model based on the training set' modelSOP = processing.SOPRecoModel(data.getUserItemMatrix(), alpha, teta) modelSOP.launch() ############################################################### # SET RECOMMENDATION ############################################################### if nbBasket == -1: evalSOP = processing.Evaluation(modelSOP, data.getBasketItemList(), nbReco) else : evalSOP = processing.Evaluation(modelSOP, data.getBasketItemList()[:nbBasket], nbReco) ############################################################### # LAUNCH RECOMMENDATION + SAVE RESULTS ############################################################### t = time.time() evalSOP.newEval() SOPTime = time.time()-t mmwrite('SOPPerf_a%s_t%s_nb%s_nr%s'%(alpha,teta,nbBasket,nbReco),evalSOP.perf) print 'SOP Execution time:', SOPTime print 'Performances : ' print evalSOP.testNames print evalSOP.meanPerf() evalSOP.savePerf('SOPPerf_a%s_t%s_nb%s_nr%s.txt'%(alpha,teta,nbBasket,nbReco)) return evalSOP
def RWWR(alpha, nbBasket, nbReco): data = load() ############################################################### # CREATE MODELS ############################################################### print 'Create the model based on the training set' modelRWWR = processing.RandomWalkWithRestartRecoModel(data.getUserItemMatrix(), alpha) ############################################################### # SET RECOMMENDATION ############################################################### if nbBasket == -1: evalRWWR = processing.Evaluation(modelRWWR, data.getBasketItemList(), nbReco) else : evalRWWR = processing.Evaluation(modelRWWR, data.getBasketItemList()[:nbBasket], nbReco) ############################################################### # LAUNCH RECOMMENDATION + SAVE RESULTS ############################################################### t = time.time() evalRWWR.newEval() RWWRTime = time.time()-t mmwrite('RWWR_a%s_nb%s'%(alpha, nbBasket),evalRWWR.perf) print 'RWWR Execution time:', RWWRTime print 'Performances :' print evalRWWR.testNames print evalRWWR.meanPerf() evalRWWR.savePerf('RWWR_a%s_nb%s'%(alpha, nbBasket)) return evalRWWR
def Cosine(nbBasket, nbReco): data = load() ############################################################### # CREATE MODELS ############################################################### print 'Create the model based on the training set' modelCosine = processing.CosineRecoModel(data.getUserItemMatrix()) ############################################################### # SET RECOMMENDATION ############################################################### if nbBasket == -1: evalCosine = processing.Evaluation(modelCosine, data.getBasketItemList(), nbReco) else : evalCosine = processing.Evaluation(modelCosine, data.getBasketItemList()[:nbBasket], nbReco) ############################################################### # LAUNCH RECOMMENDATION + SAVE RESULTS ############################################################### t = time.time() evalCosine.newEval() CosineTime = time.time()-t mmwrite('Cosine_nb%s'%nbBasket,evalCosine.perf) print 'Cosine Execution time:', CosineTime print 'Performances :' print evalCosine.testNames print evalCosine.meanPerf() evalCosine.savePerf('Cosine_nb%s.txt'%nbBasket) return evalCosine
def make_doc_vectors(fname_pat, out_fname): fnames = glob(fname_pat) labels = [splitext(basename(fn))[0] for fn in fnames] stop_words = frozenset(list(ENGLISH_STOP_WORDS) + OTHER_STOPWORDS) vectorizer = CountVectorizer(input="filename", ngram_range=(1,3), min_df=5, max_df=0.7, stop_words=stop_words, token_pattern=r"(?u)\b[A-Za-z]\w+\b") vectors = vectorizer.fit_transform(fnames) log.info("saving matrix in Numpy format to " + out_fname) np.savez(out_fname, vectorizer=vectorizer, vectors=vectors, labels=labels) base_fname = splitext(out_fname)[0] mm_fname = base_fname + ".mtx" log.info("saving matrix in Matrix Market format to " + mm_fname) mmwrite(mm_fname, vectors, "IDIScape document vectors", "integer") feat_fname = base_fname + "_features.txt" log.info("saving features to " + feat_fname) feat_names = vectorizer.get_feature_names() open(feat_fname, "w", "utf8").write(u"\n".join(feat_names)) label_fname = base_fname + "_labels.txt" log.info("saving labels to " + label_fname) open(label_fname, "w", "utf8").write(u"\n".join(labels))
def nearest_neighbor_degree_analysis(self, target, method="online"): """nearest neighbors' degree ref Empirical analysis of web-based user-object bipartite networks, Ming-Sheng Shang et al. 17 June 2010. target = 'user' means nearest neighbors' degree for users, target = 'item' means nearest neighbors' degree for items. method = 'online' means online calculation, method = 'offline' means using offline results. """ filepath = "./offline_results/nn_degree" if method == "online":# online calculation tinynum = 0.00000001 if target == "user":# for user self.ui_matrix = self.ui_matrix.tocsc() degree = sparse.csc_matrix(self.ui_matrix.sum(0)) # degree = degree + sparse.csc_matrix(np.ones([1, self.usernum]))*tinynum# to avoid zero division nn_degree = sparse.csc_matrix(self.ui_matrix.sum(1).transpose())\ .dot(self.ui_matrix)/degree elif target == "item":# for item self.ui_matrix = self.ui_matrix.tocsr() degree = sparse.csr_matrix(self.ui_matrix.sum(1)) # degree = degree + sparse.csr_matrix(np.ones([self.itemnum, 1]))*tinynum# to avoid zero division nn_degree = self.ui_matrix.dot(sparse.csr_matrix(self.ui_matrix.sum(0).transpose()))/degree else: print "target arg error !" sys.exit() if target == "user" or target == "item": try: io.mmwrite(filepath+"_%s"%target, nn_degree) except Exception,e: print e sys.exit() return nn_degree
def run(self, ratio, input_db, output_mat): db = sqlite3.connect(input_db) # assume no empty users users = db.execute("""SELECT Users.[Id] FROM Users""").fetchall() # pick <ratio> of them for training db, pick <ratio/10> of them for test db train_ids = [] test_ids = [] test_threshold = ratio/10 train_threshold = test_threshold + ratio for u in users: rnd = random.random() if (rnd <= test_threshold): test_ids.append(u[0]) elif (rnd <= train_threshold): train_ids.append(u[0]) train_matrix = self.data_to_matrix(db, train_ids).tocsc() test_matrix = self.data_to_matrix(db, test_ids).tocsc() (train_matrix, test_matrix) = self.trim_matrices(train_matrix, test_matrix) savemat(output_mat, {'train' : train_matrix,'test' : test_matrix}, oned_as = 'row') mmwrite(output_mat + '.train', train_matrix) mmwrite(output_mat + '.test', test_matrix) print("Done!")
def main(): """ Main entry point to script to perform kmeans. Returns: - `0` or `1` on success or failure respectively. - Saves `centroids`, `centroiddict`, and `clusters` in working dir. """ parser = gen_args() args = parser.parse_args() sessionid = args.sessionid data = spio.mmread(args.data).tocsc() logger = logging.getLogger(__name__) logger.addHandler(logging.StreamHandler()) if args.verbose: logger.setLevel(logging.DEBUG) if args.k: k = args.k kmeans = KMeans(data, k, args.n, args.delta, args.randomcentroids, \ args.classical, args.verbose) result = kmeans.run() clusters = result['clusters'] centroids = result['centroids'] centroiddict = result['centroiddict'] cPickle.dump(clusters, open("data_clusters_" + sessionid + '.pck', 'w')) cPickle.dump(centroiddict, open("centroid_dict_" + \ sessionid + '.pck', 'w')) spio.mmwrite(open("data_centroids_" + sessionid + '.mtx', 'w'), \ centroids, comment="CSC Matrix", field='real') logger.info(" %d Clusters Generated ", len(clusters)) return 0
def __init__(self, programEntities, sim=ssd.correlation): cleaner = DataCleaner() nusers = len(programEntities.userIndex.keys()) fin = open("../Data/users.csv", 'rb') colnames = fin.readline().strip().split(",") self.userMatrix = ss.dok_matrix((nusers, len(colnames) - 1)) for line in fin: cols = line.strip().split(",") # consider the user only if he exists in train.csv if programEntities.userIndex.has_key(cols[0]): i = programEntities.userIndex[cols[0]] self.userMatrix[i, 0] = cleaner.getLocaleId(cols[1]) self.userMatrix[i, 1] = cleaner.getBirthYearInt(cols[2]) self.userMatrix[i, 2] = cleaner.getGenderId(cols[3]) self.userMatrix[i, 3] = cleaner.getJoinedYearMonth(cols[4]) self.userMatrix[i, 4] = cleaner.getCountryId(cols[5]) self.userMatrix[i, 5] = cleaner.getTimezoneInt(cols[6]) fin.close() # normalize the user matrix self.userMatrix = normalize(self.userMatrix, norm="l1", axis=0, copy=False) sio.mmwrite("../Models/US_userMatrix", self.userMatrix) # calculate the user similarity matrix and save it for later self.userSimMatrix = ss.dok_matrix((nusers, nusers)) for i in range(0, nusers): self.userSimMatrix[i, i] = 1.0 for u1, u2 in programEntities.uniqueUserPairs: i = programEntities.userIndex[u1] j = programEntities.userIndex[u2] if not self.userSimMatrix.has_key((i, j)): usim = sim(self.userMatrix.getrow(i).todense(), self.userMatrix.getrow(j).todense()) self.userSimMatrix[i, j] = usim self.userSimMatrix[j, i] = usim sio.mmwrite("../Models/US_userSimMatrix", self.userSimMatrix)
def main(argv):
    inputfile = ''
    outputfile = ''
    try:
        opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print 'python <script name> -i <inputfile> -o <outputfile>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'python <script name> -i <inputfile> -o <outputfile>'
            sys.exit()
        elif opt in ("-i"):
            inputfile = arg
        elif opt in ("-o"):
            outputfile = arg
    print 'Reading images from', inputfile
    print 'Writing vectors to', outputfile
    input_file_name = inputfile

    """ ################### START PROGRAM ############################ """
    print "--------------START-----------------"
    running_time = time.now()
    data_matrix = process(input_file_name)
    print "Total time:" + str(time.now() - running_time)
    print "Writing to file."
    io.mmwrite(outputfile, data_matrix)
def test4(): n = 3 ; p = 2 geo = periodic_square(n=[n,n], p=[p,p]) geo_ref, list_lmatrices = geo.bezier_extract() from scipy.io import mmwrite M = list_lmatrices[0][0] mmwrite("M_conversion_test_4.mtx", M)
def RW_POP(alpha, nbBasket, nbReco): data = load() ############################################################### # CREATE MODELS ############################################################### print 'Create the model based on the training set' modelRW = processing.BasketRandomWalk_POP(data.getUserItemMatrix(), alpha) ############################################################### # SET RECOMMENDATION ############################################################### if nbBasket == -1: evalRW = processing.Evaluation(modelRW, data.getBasketItemList(), nbReco) else : evalRW = processing.Evaluation(modelRW, data.getBasketItemList()[:nbBasket], nbReco) ############################################################### # LAUNCH RECOMMENDATION + SAVE RESULTS ############################################################### t = time.time() evalRW.newEval() RWTime = time.time()-t mmwrite(resultFolder+'RW_POP_a%s_nb%s'%(alpha, nbBasket),evalRW.perf) print 'RW_POP Execution time:', RWTime print 'Performances :' print evalRW.testNames print evalRW.computePerf() evalRW.savePerf(resultFolder+'RW_POP_a%s_nb%s.txt'%(alpha, nbBasket)) return evalRW
def save_results(results, name, version="1_1"): expt_action_mean, expt_action_std, \ expt_reward_mean, expt_reward_std, epsilon = results if expt_action_mean is not None: mmwrite("./out/ex%s_%s_action_mean.mtx" % (version, name.lower()), expt_action_mean, "Experiment %s %s actions mean." % (version, name)) if expt_action_std is not None: mmwrite("./out/ex%s_%s_action_std.mtx" % (version, name.lower()), expt_action_std, "Experiment %s %s actions SD." % (version, name)) if expt_reward_mean is not None: mmwrite("./out/ex%s_%s_reward_mean.mtx" % (version, name.lower()), expt_reward_mean, "Experiment %s %s rewards mean." % (version, name)) if expt_reward_std is not None: mmwrite("./out/ex%s_%s_reward_std.mtx" % (version, name.lower()), expt_reward_std, "Experiment %s %s rewards SD." % (version, name)) if epsilon is not None: mmwrite("./out/ex%s_%s_epsilon.mtx" % (version, name.lower()), epsilon, "Experiment %s %s exploration rates." % (version, name))
def buildBaseDB(self, verticesPath): mainDir = os.path.dirname(verticesPath) verticesFileName = os.path.basename(verticesPath) targetsDBPath = os.path.join(mainDir,self.targetDBFolder) if "_vertices" not in verticesFileName: print "Error: the vertices file name must have the form of *_vertices.dat" return if not os.path.isdir(targetsDBPath): print "Error: targets folder %s not found"%(targetsDBPath) return base_file = os.path.join(mainDir, verticesFileName.replace("_vertices","_base")) vertices,lookup = buildbase.read_vertices(verticesPath) print "Loading targets..." rb,target_names = buildbase.load_targets(targetsDBPath,vertices,lookup) print "Making base..." base,back = buildbase.make_base(rb,self.cutOff) del rb print "Saving base..." f = open(base_file,'w') mmwrite(f,base) f.close() del base print "Saved in %s"%(base_file)
def make_author_vectors(crawl_fname, doc_vec_fname, auth_vec_fname): docs = np.load(doc_vec_fname) doc_vecs = docs["vectors"][()] # Convert to LIL, because modifying CSR is slow doc_vecs = doc_vecs.tolil() # Create mapping from label (=DOI) to row number (=doc vector) doi2n = dict((l,i) for i,l in enumerate(docs["labels"])) # Collect authors tree = etree.parse(crawl_fname) authors = np.array(list(set(tree.xpath("//author/text()")))) # Create empty author vectors shape = (len(authors), doc_vecs.shape[1]) auth_vecs = sp.lil_matrix(shape) # Create mapping from authors to row number (=author vector) auth2n = dict((a,i) for i,a in enumerate(authors)) ## author to group mapping ##auth2group = {} # Fill author vectors by adding doc vectors for item in tree.findall("//item"): author = item.find("author").text ##group = item.find("group") ##auth2group[author] = group url = item.find("url").text query = urlparse.urlparse(url).query doi = urlparse.parse_qs(query)["doi"][0] log.debug(u"DOI={} author={}".format(doi, author)) try: auth_vecs[auth2n[author]] += doc_vecs[doi2n[doi]] except KeyError: log.warning(u"No document with DOI={} for author {}".format( doi, author)) auth_vecs = auth_vecs.tocsr() ##group_labels = [auth2group[auth] for auth in authors] log.info("saving matrix in Numpy format to " + auth_vec_fname) np.savez(auth_vec_fname, vectorizer=docs["vectorizer"], vectors=auth_vecs, author_labels=authors, ##group_labels=group_labels ) base_fname = splitext(auth_vec_fname)[0] mm_fname = base_fname + ".mtx" log.info("saving matrix in Matrix Market format to " + mm_fname) mmwrite(mm_fname, auth_vecs, "IDIScape document vectors", "integer") label_fname = base_fname + "_labels.txt" log.info("saving labels to " + label_fname) open(label_fname, "w", "utf8").write(u"\n".join(authors))
def save_new_ref(filename, data):
    """ Saves a new version of the reference data, and backs up the old """

    ext = filename.split('.')[-1]

    if data is None:
        print("WARNING: Error generating file: %s" % filename)
        print("Skipped... try again.")
        return

    if os.path.exists(filename):
        os.system('mv %s %s' % (filename, BACKUP_DIR))

    if ext in ['h5', 'lh5']:
        if scipy.sparse.issparse(data):
            data = data.toarray()
        Serializer.SaveData(filename, data)
    elif ext == 'mtx':
        io.mmwrite(filename, data)
    elif ext == 'pkl':
        f = open(filename, 'w')
        pickle.dump(data, f)  # pickle.dump expects (obj, file)
        f.close()
    else:
        raise ValueError('Could not understand extension (.%s) for %s' % (ext, filename))

    return
def test1D2(): spl = splineRefMat(DIM_1D) # list_r = list(np.random.random(20)) list_r = [0.1,0.2,0.3] nx = 3 px = 2 geo = line(n=[nx], p=[px]) nrb = geo[0] knots = nrb.knots[0] n = nrb.shape[0] p = nrb.degree[0] P = nrb.points M = spl.construct(list_r, p, n, knots) from scipy.io import mmwrite mmwrite('M.mtx', M) R = M.dot(nrb.points[:,0]) geo = line(n=[nx], p=[px]) geo.refine(id=0, list_t=[list_r]) nrb = geo[0] P = np.asarray(nrb.points[:,0]) assert(np.allclose(P,R)) print("test1D2: OK")
def main111(): if 1: G = nx.read_edgelist(infname) print nx.info(G) # Graph adj matix A = nx.to_scipy_sparse_matrix(G) print type(A) from scipy import sparse, io io.mmwrite("Results/test.mtx", A) exit() # write to disk clustering coeffs for this graph snm.get_clust_coeff([G], 'orig', 'mmonth') # write to disk egienvalue snm.network_value_distribution([G], [], 'origMmonth') if 0: edgelist = np.loadtxt(infname, dtype=str, delimiter='\t') print edgelist[:4] idx = np.arange(len(edgelist)) np.random.shuffle(idx) subsamp_edgelist = edgelist[idx[:100]] G = nx.Graph() G.add_edges_from([(long(x), long(y)) for x, y in subsamp_edgelist]) # visualize this graph # visualize_graph(G) exit() G = nx.Graph() G.add_edges_from([(long(x), long(y)) for x, y in edgelist]) print nx.info(G) print 'Done'
def Pop(nbBasket, nbReco): data = load() ############################################################### # CREATE MODELS ############################################################### print 'Create the model based on the training set' modelPop = processing.PopRecoModel(data.getUserItemMatrix()) ############################################################### # SET RECOMMENDATION ############################################################### if nbBasket == -1: evalPop = processing.Evaluation(modelPop, data.getBasketItemList(), nbReco) else : evalPop = processing.Evaluation(modelPop, data.getBasketItemList()[:nbBasket], nbReco) ############################################################### # LAUNCH RECOMMENDATION + SAVE RESULTS ############################################################### t = time.time() evalPop.newEval() PopTime = time.time()-t mmwrite(resultFolder+'Pop',evalPop.perf) print 'Pop Execution time:', PopTime print 'Performances :' print evalPop.testNames print evalPop.computePerf() evalPop.savePerf(resultFolder+'Pop.txt') return evalPop
def process_dataset(name): prefix = 'dataset/' + name + '/' + name fea = np.loadtxt(prefix + '.fea') # transform link link_data = np.loadtxt(prefix + '.link') link_data = link_data - 1 reverse_link_data = np.append(link_data[:, 1][:,np.newaxis], link_data[:, 0][:,np.newaxis], axis=1) link_data = np.append(link_data, reverse_link_data, axis=0) weight = [1]*link_data.shape[0] num_inst = fea.shape[0] link = sparse.csr_matrix((weight, link_data.transpose()), shape=(num_inst, num_inst)) # transform label gnd = np.loadtxt(prefix + '.gnd') lb = preprocessing.LabelBinarizer() label = lb.fit_transform(gnd) label = label.astype(np.float) # use max component g = nx.Graph(link) mc = nx.connected_components(g)[0] link = link[mc, :][:, mc] label = label[mc, :] fea = fea[mc, :] # save np.save(prefix + '_fea', fea) mmwrite(prefix + '_link', link) np.save(prefix + '_label', label) np.save(prefix + '_mc', mc) return fea, link, label
def df_to_sparse(jour, df_in, Fam, col):
    # INPUTS:
    #   jour  = day under consideration
    #   df_in = name of the input dataframe to process
    #   Fam   = family under consideration
    #   col   = columns of the input dataframe to keep in the output
    #           (if col = 0, the whole dataframe is kept)
    # OUTPUT:
    #   No return value; the file is saved as a sparse matrix.
    # FUNCTION:
    dossier = str(jour) + '.03.2013'
    path_entree = 'C:\Users\lbn\Documents\\data_frames\\'
    path_sortie = 'C:\Users\lbn\Documents\\data_frames\\sparse_data\\'
    doc_entree = path_entree + dossier + '\\' + df_in + str(jour) + '.pickle'
    doc_sortie = path_sortie + dossier + '\\' + df_in + str(jour) + Fam

    data = read_pickle(doc_entree)
    if type(col) == list:
        data = csr_matrix(data.iloc[:, col], dtype=np.int8)
    else:
        data = csr_matrix(data, dtype=np.int8)

    io.mmwrite(doc_sortie, data)
    print('Conversion done for:', df_in + str(jour) + Fam)
def CondProb(alpha, nbBasket, nbReco): data = load() ############################################################### # CREATE MODELS ############################################################### print 'Create the model based on the training set' modelCondProb = processing.CondProbRecoModel(data.getUserItemMatrix(), alpha) ############################################################### # SET RECOMMENDATION ############################################################### if nbBasket == -1: evalCondProb = processing.Evaluation(modelCondProb, data.getBasketItemList(), nbReco) else : evalCondProb = processing.Evaluation(modelCondProb, data.getBasketItemList()[:nbBasket], nbReco) ############################################################### # LAUNCH RECOMMENDATION + SAVE RESULTS ############################################################### t = time.time() evalCondProb.newEval() CondProbTime = time.time()-t mmwrite(resultFolder+'CondProb_a%s_nb%s'%(alpha, nbBasket),evalCondProb.perf) print 'Cond Prob Execution time:', CondProbTime print 'Performances :' print evalCondProb.testNames print evalCondProb.computePerf() evalCondProb.savePerf(resultFolder+'CondProb_a%s_nb%s.txt'%(alpha, nbBasket)) return evalCondProb
def process_SVD2(inputFileName, outputFileName, n, p, showError): """ Perform SVD2. """ mat, rowids = loadMatrix(inputFileName) X = mat.tocsc() ut, s, vt = sparsesvd(X, n) A = np.dot(np.dot(ut.T, np.diag(s**p)), vt) saveMatrix(A, rowids, outputFileName) mmwrite("%s.ut" % inputFileName, ut) np.savetxt("%s.s" % inputFileName, s) mmwrite("%s.vt" % inputFileName, vt) if showError: Xnorm = np.linalg.norm(X.todense(), ord='fro') Error = np.linalg.norm((X - A), ord='fro') rate = (100 * Error) / Xnorm print "Approximation Error Percentage = %f%%" % rate print "Frobenius norm of the original matrix =", Xnorm print "Frobenius norm of the error matrix =", Error pass
def peak_count_matrix(atac_inter_bed, out_prefix): # first three columns is 'chr','start','end' # last column is barcode inter_peak = pd.read_csv(atac_inter_bed, header=None, sep='\t') inter_peak.columns = ['chr', 'start', 'end' ] + [''] * (inter_peak.shape[1] - 4) + ['barcode'] #inter_peak['peak']=inter_peak['name'].apply(lambda x: x.split('/')[-1]) inter_peak['peak'] = inter_peak.apply( lambda x: x['chr'] + ':' + str(x['start']) + '-' + str(x['end']), axis=1) bc_peak_counts = inter_peak.groupby(['barcode', 'peak']).size() df = pd.DataFrame(bc_peak_counts) df.reset_index(inplace=True) new_df = df.pivot(index='peak', columns='barcode', values=0) new_df.columns.name = None new_df.index.name = None new_df = new_df.fillna(0) new_df = new_df.astype('int') mmwrite(out_prefix + 'count.mtx', csr_matrix(new_df)) np.savetxt(out_prefix + 'peaks.txt', new_df.index.values, fmt="%s") np.savetxt(out_prefix + 'barcodes.txt', new_df.columns.values, fmt="%s")
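# Usage sketch (assumption, not from the original source): calling
# peak_count_matrix above on a hypothetical intersected-fragments BED file.
# It writes <prefix>count.mtx, <prefix>peaks.txt and <prefix>barcodes.txt.
peak_count_matrix('sample_fragments_in_peaks.bed', 'sample_atac_')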
def prepare_bow_matrix(labeled) -> None: """Function to prepare the BOW matrix.""" savedir = Path(config['data']['save_path']) cvect = CountVectorizer(strip_accents='ascii', min_df=2) cvect.fit((savedir / 'train.txt').read_text().splitlines()) if labeled: savedir = savedir / 'labeled' train_dbyw = cvect.transform( (savedir / 'train.txt').read_text().splitlines(), ) valid_dbyw = cvect.transform( (savedir / 'valid.txt').read_text().splitlines(), ) test_dbyw = cvect.transform( (savedir / 'test.txt').read_text().splitlines(), ) mmwrite(str(savedir / 'train.mtx'), train_dbyw) mmwrite(str(savedir / 'valid.mtx'), valid_dbyw) mmwrite(str(savedir / 'test.mtx'), test_dbyw) with (savedir / 'vocab').open('w', encoding='utf-8') as fvpw: fvpw.write('\n'.join(cvect.vocabulary_)) with (savedir / 'mtx.flist').open('w') as flpw: flpw.write('{0}\n'.format(savedir.resolve() / 'train.mtx')) flpw.write('{0}\n'.format(savedir.resolve() / 'valid.mtx')) flpw.write('{0}\n'.format(savedir.resolve() / 'test.mtx'))
def expression_matrix(df, validated_barcodes, outdir, sample, gtf_file): matrix_10X_dir = f"{outdir}/{sample}_matrix_10X/" matrix_table_file = f"{outdir}/{sample}_matrix.tsv.gz" if not os.path.exists(matrix_10X_dir): os.mkdir(matrix_10X_dir) df.loc[:, 'mark'] = 'UB' df.loc[df['Barcode'].isin(validated_barcodes), 'mark'] = 'CB' CB_total_Genes = df.loc[df['mark'] == 'CB', 'geneID'].nunique() CB_reads_count = df.loc[df['mark'] == 'CB', 'count'].sum() reads_mapped_to_transcriptome = df['count'].sum() table = df.loc[df['mark'] == 'CB', :].pivot_table( index='geneID', columns='Barcode', values='UMI', aggfunc=len).fillna(0).astype(int) id_name = gene_convert(gtf_file) id = table.index.to_series() name = id.apply(lambda x: id_name[x]) genes = pd.concat([id, name], axis=1) genes.columns = ['gene_id', 'gene_name'] # write 10X matrix table.columns.to_series().to_csv(f'{matrix_10X_dir}/barcodes.tsv', index=False, sep='\t') genes.to_csv(f'{matrix_10X_dir}/genes.tsv', index=False, header=False, sep='\t') mmwrite(f'{matrix_10X_dir}/matrix', csr_matrix(table)) # convert id to name; write table matrix table.index = name table.index.name = "" table.to_csv(matrix_table_file, sep="\t", compression='gzip') return (CB_total_Genes, CB_reads_count, reads_mapped_to_transcriptome)
def load_graph(name): dir = 'data/' try: if isinstance(name, str): meshname = dir + name else: meshname = dir + names[name] mesh = loadmat(meshname) except IOError as e: print 'Matrix market file : %s.mtx not available...downloading' % meshname url = base % (meshname, meshname) response = urllib2.urlopen(url) graph = response.read() adj_lists = [map(int, a.split()) for a in graph.splitlines() if a] num_nodes, num_edges = adj_lists[0] vertex_degrees = [len(edges) for edges in adj_lists[1:]] node_lists = [ itertools.repeat(i, n) for i, n in zip(range(num_nodes), vertex_degrees) ] I = numpy.array(list(itertools.chain(*node_lists))) J = numpy.array(list(itertools.chain(*adj_lists[1:]))) - 1 V = numpy.ones(2 * num_edges) G = coo_matrix((V, (I, J)), shape=(num_nodes, num_nodes)) mmwrite(dir + meshname, G) G_nx = make_graph(G) pos = nx.spring_layout(G_nx, iterations=200) x = numpy.array([pos[i][0] for i in range(G.shape[0])]) y = numpy.array([pos[i][1] for i in range(G.shape[0])]) V = numpy.vstack((x, y)).T E = numpy.vstack((G.row, G.col)).T mesh = {'V': V, 'E': E} savemat(dir + meshname, mesh) return mesh
def inference(dataloader, net, criterion, opt, OutputDir):
    net.eval()
    for i, (sample_idx, annotation, adj_matrix, label, mask) in enumerate(dataloader, 0):
        padding = torch.zeros(opt.batchSize, opt.n_node, opt.L,
                              opt.state_dim - opt.annotation_dim).double()
        init_input = torch.cat((annotation, padding), 3)

        if opt.cuda:
            adj_matrix = adj_matrix.cuda()
            annotation = annotation.cuda()
            init_input = init_input.cuda()
            label = label.cuda()
            mask = mask.cuda()

        adj_matrix = Variable(adj_matrix)
        annotation = Variable(annotation)
        init_input = Variable(init_input)
        target = Variable(label)
        mask = Variable(mask)

        output = net(init_input)
        output = output.argmax(axis=2)[:, :, np.newaxis]

        # save the predictions, labels and masks
        os.makedirs(OutputDir + "/output", exist_ok=True)
        for batch in range(opt.batchSize):
            p = output.detach().numpy()[batch]
            t = target[batch].numpy()
            m = mask[batch].numpy()
            mmwrite(OutputDir + "/output/pred" + str(sample_idx.numpy()[batch]), lil_matrix(p))
            mmwrite(OutputDir + "/output/true" + str(sample_idx.numpy()[batch]), lil_matrix(t))
            mmwrite(OutputDir + "/output/mask" + str(sample_idx.numpy()[batch]), lil_matrix(m))
def cover(socp_data, N): """stacks the socp data and partitions it into N local dicts describing constraints R <= s""" if not settings.paths['mondriaan']: raise Exception( "Please provide a path to mondriaan: settings.paths['mondriaan'] = PATH.") n = socp_data['c'].shape[0] # form the Laplacian and use pymetis to partition L = form_laplacian(socp_data) io.mmwrite("mondriaan.mtx", L) import subprocess outpath = "mondriaan.mtx-P%d" % N proc = subprocess.Popen( [settings.paths['mondriaan'], "mondriaan.mtx", str(N), "0.05"]) proc.wait() with open(outpath, "r") as f: f.readline() # ignore comments f.readline() # ignore comments # basic info about the matrix m, _, _, _ = f.readline().strip().split(" ") pstart = [] # read the starting index of the partition for i in xrange(N + 1): pstart.append(int(f.readline())) part_vert = np.zeros(int(m), dtype=np.int) count = 0 part = 0 for i in xrange(N): while count < pstart[i + 1]: (row, col, val) = f.readline().strip().split(" ") part_vert[int(row) - 1] = part count += 1 part += 1 return part_vert[n:]
def inference(dataloader, opt, OutputDir, Attribute_idx):
    for i, (sample_idx, annotation, adj_matrix, label, mask) in enumerate(dataloader, 0):
        target = Variable(label)
        mask = Variable(mask)

        output = annotation[:, :, -1][:, :, Attribute_idx]
        output = output.argmax(axis=2)[:, :, np.newaxis]
        for batch in range(opt.batchSize):
            ts = int(sample_idx[batch].numpy())
            output[batch][pred_binary[ts] > threshold] = torch.LongTensor(
                pred_transfer[ts][pred_binary[ts] > threshold])

        # save the predictions, labels and masks
        os.makedirs(OutputDir + "/output", exist_ok=True)
        for batch in range(opt.batchSize):
            p = output.detach().numpy()[batch]
            t = target[batch].numpy()
            m = mask[batch].numpy()
            mmwrite(OutputDir + "/output/pred" + str(sample_idx.numpy()[batch]), lil_matrix(p))
            mmwrite(OutputDir + "/output/true" + str(sample_idx.numpy()[batch]), lil_matrix(t))
            mmwrite(OutputDir + "/output/mask" + str(sample_idx.numpy()[batch]), lil_matrix(m))
def __init__(self, programEntities):
    nusers = len(programEntities.userIndex.keys())
    self.numFriends = np.zeros((nusers))
    self.userFriends = ss.dok_matrix((nusers, nusers))
    fin = open("user_friends.csv", 'rb')
    fin.readline()  # skip header
    ln = 0
    for line in fin:
        if ln % 200 == 0:
            print("Loading line: ", ln)
        cols = line.strip().split(",")
        user = cols[0]
        if programEntities.userIndex.has_key(user):
            friends = cols[1].split(" ")
            i = programEntities.userIndex[user]
            self.numFriends[i] = len(friends)
            for friend in friends:
                if programEntities.userIndex.has_key(friend):
                    j = programEntities.userIndex[friend]
                    # the objective of this score is to infer the degree to
                    # and direction in which this friend will influence the
                    # user's decision, so we sum the user/event score for
                    # this user across all training events.
                    eventsForUser = programEntities.userEventScores.getrow(j).todense()
                    score = eventsForUser.sum() / np.shape(eventsForUser)[1]
                    self.userFriends[i, j] += score
                    self.userFriends[j, i] += score
        ln += 1
    fin.close()
    # normalize the arrays
    sumNumFriends = self.numFriends.sum(axis=0)
    self.numFriends = self.numFriends / sumNumFriends
    sio.mmwrite("UF_numFriends", np.matrix(self.numFriends))
    self.userFriends = normalize(self.userFriends, norm="l1", axis=0, copy=False)
    sio.mmwrite("UF_userFriends", self.userFriends)
def vector_word():
    with open('I:\MeachineLearnProject\SpamMessage-LR-Twt/RawData/train_content.json', 'r') as f:
        content = json.load(f)
    with open('I:\MeachineLearnProject\SpamMessage-LR-Twt/RawData/train_label.json', 'r') as f:
        label = json.load(f)
    '''
    vec_count = MessageCountVectorizer(min_df=2, max_df=0.8)
    data_count = vec_count.fit_transform(content)
    name_count_feature = vec_count.get_feature_names()
    '''
    content_sub = content[0:100000]
    label_sub = label[0:100000]
    return content_sub, label_sub

    # NOTE: everything below is unreachable because of the return above, and it
    # references data_tfidf / name_tfidf_feature, which are only defined in the
    # commented-out TfidfVectorizer block.
    #print(content_sub)
    #vec_tfidf = TfidfVectorizer(min_df=2, max_df=0.8)
    #vec_tfidf = TfidfVectorizer()
    #data_tfidf = vec_tfidf.fit_transform(content_sub)
    #weight = vec_tfidf.fit_transform(content_sub).toarray()
    #print(data_tfidf)
    #name_tfidf_feature = vec_tfidf.get_feature_names()
    #print(name_tfidf_feature)
    #DecisionTreeClassifyTfidf(data_tfidf,,name_tfidf_feature)
    io.mmwrite('I:\MeachineLearnProject\SpamMessage-LR-Twt/Data/word_vector_sub.mtx', data_tfidf)
    with open('I:\MeachineLearnProject\SpamMessage-LR-Twt/Data/train_label_sub.json', 'w') as f:
        json.dump(label, f)
    with open('I:\MeachineLearnProject\SpamMessage-LR-Twt/Data/vector_type_sub.json', 'w') as f:
        json.dump(name_tfidf_feature, f)
def get_debug(data): full_train = sio.mmread('data/%s_train.mtx' % data).tocsr() (nu, nm) = full_train.shape print 'sampling' debug_mids = sample(range(nm), nm / 5) debug_uids = sample(range(nu), nu / 5) debug = full_train[debug_uids][:, debug_mids].tocoo() nr = debug.nnz train_ids, _, test_ids = sample_split(nr) # build matrix from given indices print 'writing debug_train' debug_train = coo_matrix( (debug.data[train_ids], (debug.row[train_ids], debug.col[train_ids])), debug.shape) sio.mmwrite('data/%s_debug_train.mtx' % data, debug_train) print 'writing debug_test' debug_test = coo_matrix( (debug.data[test_ids], (debug.row[test_ids], debug.col[test_ids])), debug.shape) sio.mmwrite('data/%s_debug_test.mtx' % data, debug_test) # build movie mtx from debug_mids print 'movie debug' movies = sio.mmread('data/movies.mtx').tocsr() movies_debug = movies[debug_mids] sio.mmwrite('data/movies_%s_debug.mtx' % data, movies_debug) return debug, debug_train, debug_test, movies_debug
def __init__(self, programEntities, psim=ssd.correlation, csim=ssd.cosine):
    cleaner = DataCleaner()
    fin = open("../Data/events.csv", 'rb')
    fin.readline()  # skip header
    nevents = len(programEntities.eventIndex.keys())
    self.eventPropMatrix = ss.dok_matrix((nevents, 7))
    self.eventContMatrix = ss.dok_matrix((nevents, 100))
    ln = 0
    for line in fin.readlines():
        # if ln > 10:
        #     break
        cols = line.strip().split(",")
        eventId = cols[0]
        if programEntities.eventIndex.has_key(eventId):
            i = programEntities.eventIndex[eventId]
            self.eventPropMatrix[i, 0] = cleaner.getJoinedYearMonth(cols[2])  # start_time
            self.eventPropMatrix[i, 1] = cleaner.getFeatureHash(cols[3])      # city
            self.eventPropMatrix[i, 2] = cleaner.getFeatureHash(cols[4])      # state
            self.eventPropMatrix[i, 3] = cleaner.getFeatureHash(cols[5])      # zip
            self.eventPropMatrix[i, 4] = cleaner.getFeatureHash(cols[6])      # country
            self.eventPropMatrix[i, 5] = cleaner.getFloatValue(cols[7])       # lat
            self.eventPropMatrix[i, 6] = cleaner.getFloatValue(cols[8])       # lon
            for j in range(9, 109):
                self.eventContMatrix[i, j - 9] = cols[j]
        ln += 1
    fin.close()
    self.eventPropMatrix = normalize(self.eventPropMatrix, norm="l1", axis=0, copy=False)
    sio.mmwrite("../Models/EV_eventPropMatrix", self.eventPropMatrix)
    self.eventContMatrix = normalize(self.eventContMatrix, norm="l1", axis=0, copy=False)
    sio.mmwrite("../Models/EV_eventContMatrix", self.eventContMatrix)
    # calculate similarity between event pairs based on the two matrices
    self.eventPropSim = ss.dok_matrix((nevents, nevents))
    self.eventContSim = ss.dok_matrix((nevents, nevents))
    for e1, e2 in programEntities.uniqueEventPairs:
        i = programEntities.eventIndex[e1]
        j = programEntities.eventIndex[e2]
        if not self.eventPropSim.has_key((i, j)):
            epsim = psim(self.eventPropMatrix.getrow(i).todense(),
                         self.eventPropMatrix.getrow(j).todense())
            self.eventPropSim[i, j] = epsim
            self.eventPropSim[j, i] = epsim
        if not self.eventContSim.has_key((i, j)):
            ecsim = csim(self.eventContMatrix.getrow(i).todense(),
                         self.eventContMatrix.getrow(j).todense())
            # store the content similarity (ecsim), not the property similarity
            self.eventContSim[i, j] = ecsim
            self.eventContSim[j, i] = ecsim
    sio.mmwrite("../Models/EV_eventPropSim", self.eventPropSim)
    sio.mmwrite("../Models/EV_eventContSim", self.eventContSim)
def load_stats(stats_f, vocab_f): """ Validate and load the input stats """ stats = sio.mmread(stats_f) if vocab_f: vocab = read_simple_flist(vocab_f) # Check the compatibility of stats if stats.shape[1] == len(vocab): stats = stats.T print("Transposed the stats to make them word-by-doc.") sio.mmwrite(os.path.realpath(stats_f), stats) if stats.shape[0] != len(vocab): print( "Number of rows in stats should match with length of vocabulary." ) print("Given stats:", stats.shape[0], "vocab. length:", len(vocab)) sys.exit() return stats.tocsc()
def __init__(self, programEntities=None, isClean=True):
    print("Computing event popularity: initialization started...")
    if isClean == False:
        raise ImportError("Skipping the popularity computation makes the results "
                          "inaccurate! Set isClean to True or leave the default.")
    self.programEntities = programEntities
    self.num_events = len(self.programEntities.eventIndex.keys())
    self.eventPopularity = ss.dok_matrix((self.num_events, 1))
    with open('event_attendees.csv', 'rb') as reader:
        reader.readline()  # skip header
        for line in reader:
            cols = line.strip().split(",")
            eventId = cols[0]
            if eventId in self.programEntities.eventIndex:
                i = self.programEntities.eventIndex[eventId]
                # popularity = number of "yes" responses minus number of "no" responses
                self.eventPopularity[i, 0] = len(cols[1].split(" ")) - len(cols[4].split(" "))
    self.eventPopularity = normalize(self.eventPopularity, norm="l1",
                                     axis=0, copy=False)
    sio.mmwrite("Event_Popularity", self.eventPopularity)
    print("Computing event popularity: done...\n\n{}\n".format("*" * 200))
def build_text_vector(fin, stopwords_pattern): """ Create temporary fields by concatenating text columns to form a new column and generate a vector of term frequencies. """ print "Building text vector..." fout = str.replace(fin, ".csv", ".text.mtx") if os.path.isfile(fout): return ftmp = str.replace(fin, ".csv", ".tmp") reader = csv.reader(open(fin, 'rb')) tmpwriter = open(ftmp, 'wb') ln = 0 for row in reader: ln += 1 if ln <= 1: continue # skip header if ln % 1000 == 0: print "...(processed %d lines)" % (ln) title = row[1] full_description = extract_keywords(row[2], stopwords_pattern) loc_raw = row[3] tmpwriter.write(" ".join( [title, title, title, title, full_description, loc_raw, loc_raw]) + "\n") tmpwriter.close() vectorizer = sft.CountVectorizer(max_features=1000) # vectorizer = sft.TfidfVectorizer( # charset_error="ignore", # strip_accents="ascii", # stop_words="english", # max_features=100, # use_idf=False) tmpreader = open(ftmp, 'rb') tdmatrix = vectorizer.fit_transform(tmpreader) os.remove(ftmp) writer = open(fout, 'wb') sio.mmwrite(writer, tdmatrix) writer.close()
def save_randomforest_path(model_path, w2v_path): if os.path.exists(model_path): print("the model already exists.") clf = joblib.load(model_path) else: print("the model doesn't exists.") return None w2v_list = list() for root, dirs, files in os.walk(w2v_path): for file in files: if os.path.splitext(file)[1] == '.txt': w2v_list.append(file) w2v_list.sort() for w2v_name in w2v_list: filename = BasePath + "/w2v_corpus/" + w2v_name w2v_vec = np.loadtxt(filename) print(w2v_vec.shape) path_of_sample, _ = clf.decision_path(w2v_vec) save_file_path = BasePath + "/rf_path/" + "path_" + w2v_name.split( '.')[0] + ".mtx" io.mmwrite(save_file_path, path_of_sample)
def __init__(self, programEntities, sim=ssd.correlation):
    cleaner = DataCleaner()
    nusers = len(programEntities.userIndex.keys())
    fin = open("../data/users.csv", 'rb')
    colnames = fin.readline().strip().split(",")
    self.userMatrix = ss.dok_matrix((nusers, len(colnames) - 1))
    for line in fin:
        cols = line.strip().split(",")
        # only consider users that appear in train.csv
        if programEntities.userIndex.has_key(cols[0]):
            i = programEntities.userIndex[cols[0]]
            # preprocess the raw values before storing them in userMatrix
            self.userMatrix[i, 0] = cleaner.getLocaleId(cols[1])
            self.userMatrix[i, 1] = cleaner.getBirthYearInt(cols[2])
            self.userMatrix[i, 2] = cleaner.getGenderId(cols[3])
            self.userMatrix[i, 3] = cleaner.getJoinedYearMonth(cols[4])
            self.userMatrix[i, 4] = cleaner.getCountryId(cols[5])
            self.userMatrix[i, 5] = cleaner.getTimezoneInt(cols[6])
    fin.close()
    # normalize the user matrix
    self.userMatrix = normalize(self.userMatrix, norm="l1", axis=0, copy=False)
    sio.mmwrite("US_userMatrix", self.userMatrix)
    # compute the user similarity matrix and save it for later use
    self.userSimMatrix = ss.dok_matrix((nusers, nusers))
    for i in range(0, nusers):
        self.userSimMatrix[i, i] = 1.0
    for u1, u2 in programEntities.uniqueUserPairs:
        i = programEntities.userIndex[u1]
        j = programEntities.userIndex[u2]
        if not self.userSimMatrix.has_key((i, j)):
            usim = sim(self.userMatrix.getrow(i).todense(),
                       self.userMatrix.getrow(j).todense())
            self.userSimMatrix[i, j] = usim
            self.userSimMatrix[j, i] = usim
    sio.mmwrite("US_userSimMatrix", self.userSimMatrix)
def Users():
    pr = ProgramEntities()
    nusers = len(pr.userIndex)
    sim = ssd.correlation
    cleaner = DataCleaner()
    fin = open(r"D:\kaggle_data\event_recommendation\users.csv", 'r')
    colnames = fin.readline().strip().split(",")
    userMatrix = sparse.dok_matrix((nusers, len(colnames) - 1))  # sparse matrix
    for line in fin:
        cols = line.strip().split(",")
        # only consider users that appear in train.csv
        if cols[0] in pr.userIndex.keys():
            i = pr.userIndex[cols[0]]
            userMatrix[i, 0] = cleaner.getLocaleId(cols[1])
            userMatrix[i, 1] = cleaner.getBirthYearInt(cols[2])
            userMatrix[i, 2] = cleaner.getGenderId(cols[3])
            userMatrix[i, 3] = cleaner.getJoinedYearMonth(cols[4])
            userMatrix[i, 4] = cleaner.getCountryId(cols[5])
            userMatrix[i, 5] = cleaner.getTimezoneInt(cols[6])
    fin.close()
    # print(userMatrix)

    # normalize the user matrix and save it
    userMatrix_N = normalize(userMatrix, norm='l1', axis=0)  # axis=0: normalize along the columns
    # sio.mmwrite("US_userMatrix", userMatrix_N)

    # build a user similarity matrix
    userSimMatrix = sparse.dok_matrix((nusers, nusers))
    for i in range(nusers):
        userSimMatrix[i, i] = 1.0
    for u1, u2 in pr.uniqueUserPairs:
        i = pr.userIndex[u1]
        j = pr.userIndex[u2]
        # dok_matrix keys are (row, col) tuples, so check the pair directly
        if (i, j) not in userSimMatrix.keys():
            usim = sim(userMatrix_N.getrow(i).todense(),
                       userMatrix_N.getrow(j).todense())
            userSimMatrix[i, j] = usim
            userSimMatrix[j, i] = usim
    sio.mmwrite("US_userSimMatrix", userSimMatrix)
def main(): """ Main entry point to script to perform spectral co-clustering. Returns: - `0` or `1` on success or failure respectively. - Saves `centroids`, `centroiddict`, `clusters` and `clusterdict` in \ working dir. """ parser = gen_args() args = parser.parse_args() sessionid = args.sessionid A = spio.mmread(args.A).tocsc() logger = logging.getLogger(__name__) logger.addHandler(logging.StreamHandler()) if args.verbose: logger.setLevel(logging.DEBUG) if args.k: k = args.k spcc = SpectralCoClusterer(A, k, args.n, args.delta, \ args.randomcentroids, \ args.classical, args.verbose) result = spcc.run() clusters = result['clusters'] centroids = result['centroids'] centroid_dict = result['centroiddict'] cluster_dict = result['clusterdict'] cPickle.dump(clusters, open("clusters_" + sessionid + '.pck', 'w')) cPickle.dump(centroid_dict, open("centroid_dict_" + \ sessionid + '.pck', 'w')) cPickle.dump(cluster_dict, open("cluster_dict_" + \ sessionid + '.pck', 'w')) spio.mmwrite(open("centroids_" + sessionid + '.mtx', 'w'), \ centroids, comment="CSC Matrix", field='real') logger.info(" %d Clusters Generated ", len(clusters)) return 0
def genHashes(): with open('pickled_minhash/feature_matrix_binary_sample.npy', 'rb') as f: #size of feature_matrix_large: 1261 x 19043 feature_matrix = np.load(f) with open('pickled_minhash/actual_jaccard_matrix_small.mtx', 'rb') as f: baseline = io.mmread(f) baseline = baseline.todok() jaccard_matrix = [] #jaccard_matrix_pre is a list of arrays that contain non-zero indicies of each article in the corpus for i in feature_matrix[0:test_num]: indicies = np.flatnonzero(i) jaccard_matrix.append(indicies) k_vals = [16, 32, 64, 128, 256] for k in k_vals: #calculate minHash with k =16 S = sparse.dok_matrix((len(jaccard_matrix), len(jaccard_matrix))) t0 = time.time() hashmap = {} for i in range(0, len(jaccard_matrix)): mh = MinHash(num_perm=k) for d in jaccard_matrix[i]: mh.digest(sha1(struct.pack("!I", d))) hashmap[i] = mh for i in range(0, len(jaccard_matrix)): m8_1 = hashmap[i] for j in range(0, i + 1): m8_2 = hashmap[j] estj = MinHash.jaccard(m8_1, m8_2) if estj != 0 and estj != 1: S[i, j] = estj print("Time to calculate first %s estj (k=%s): %f" % (len(jaccard_matrix), k, time.time() - t0)) with open('pickled_minhash/estj_' + str(k) + '_small.mtx', 'wb') as f: #size of feature_matrix_large: 1261 x 19043 io.mmwrite(f, S)
def save_sparse_matrix(data, fmt, filepath): """ Save a scipy sparse matrix in the specified format. Row and column indices will be converted to 1-indexed if you specify a plain text format (tsv, csv, mm). Note that zero entries are guaranteed to be saved in tsv or csv format. Parameters ---------- data : scipy sparse matrix to save fmt : str Specifies the file format to write: - tsv - csv - mm (MatrixMarket) - npz (save as npz archive of numpy arrays) - fsm (mrec.sparse.fast_sparse_matrix) filepath : str The file to load. """ if fmt == 'tsv': m = data.tocoo() with open(filepath, 'w') as out: for u, i, v in izip(m.row, m.col, m.data): print >> out, '{0}\t{1}\t{2}'.format(u + 1, i + 1, v) elif fmt == 'csv': m = data.tocoo() with open(filepath, 'w') as out: for u, i, v in izip(m.row, m.col, m.data): print >> out, '{0},{1},{2}'.format(u + 1, i + 1, v) elif fmt == 'mm': mmwrite(filepath, data) elif fmt == 'npz': savez(data.tocoo(), filepath) elif fmt == 'fsm': fast_sparse_matrix(data).save(filepath) else: raise ValueError('unknown output format: {0}'.format(fmt))
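# Usage sketch (assumption, not from the original source): saving the same
# sparse matrix in two of the formats supported by save_sparse_matrix above.
import scipy.sparse as sp

ratings = sp.random(1000, 200, density=0.01, format='csr')
save_sparse_matrix(ratings, 'mm', 'ratings.mtx')   # MatrixMarket via mmwrite
save_sparse_matrix(ratings, 'tsv', 'ratings.tsv')  # 1-indexed row/col/value triples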
def index(search_dir, index_dir): cmd = 'pdftotext "%s" %s/loog.txt' dirs, files = rsync.ls(search_dir) files = [(f, size) for (f, size) in files if '.pdf' in f] N = len(files) A = sps.lil_matrix((N, cols)) print A.shape df_files = [] for i, (f, size) in enumerate(files): file = f.replace("\\", "/") print file if ".pdf" in file: cmd2 = cmd % (f, os.environ['TEMP']) os.system(cmd2) lowers = open( "%s/loog.txt" % os.environ['TEMP']).read().decode("ISO-8859-1").lower() tokens = nltk.word_tokenize(lowers) tokens = stem_tokens(tokens) print tokens[:30] for token in tokens: A[i, hash(token) % cols] += 1 df_files.append([file, size]) df = A.copy() df[df > 0] = 1. df = np.array(df.sum(axis=0)) idf = df.copy() idf[df.nonzero()] = np.log(N / df[df.nonzero()]) io.mmwrite(index_dir + "/loogle_idf.mtx", idf) tf = A.copy().tocoo() tf.data = 1 + np.log(tf.data) tfidf = sps.csr_matrix(tf.multiply(idf)) tfidf = normalize(tfidf, norm='l2', axis=1) io.mmwrite(index_dir + "/loogle_tfidf.mtx", tfidf) df_files = pd.DataFrame(df_files, columns=['file', 'size']) df_files.to_csv(index_dir + "/loogle_files.csv", index=None)
def main(): usage = "" # TODO parser = OptionParser(usage=usage) #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat") parser.add_option("-o", "--out_dir", help="Output directory") (options, args) = parser.parse_args() dataset = args[0] out_dir = options.out_dir r = load_dataset.load_dataset(dataset, 'counts') the_exps = r[3] data_matrix = r[10] gene_ids = r[11] genes_df = pd.read_csv('genes.tsv', sep='\t', index_col=0, header=None) genes_df = genes_df.loc[gene_ids] with open(join(out_dir, 'matrix.mtx'), 'wb') as f: mmwrite(f, coo_matrix(data_matrix.T)) with open(join(out_dir, 'barcodes.tsv'), 'w') as f: f.write('\n'.join(the_exps)) genes_df.to_csv(join(out_dir, 'genes.tsv'), sep='\t', header=False)
def calculate(words, feature, fname): e = np.zeros((len(words), len(words))) for row in range(0, len(words)): for column in range(row, len(words)): if row < column: tem = (np.dot(feature[row], feature[column]))\ / (np.linalg.norm(feature[row])*np.linalg.norm(feature[column])) e[row][column] = np.arccos(-tem) e[column][row] = e[row][column] for i in range(0, len(words)): kth_max = np.sort(e[i]) zero_list = [0] e_row = e[i] e_row = np.where(e_row < kth_max[len(words) - 300], zero_list, e_row) e[i] = e_row for row in range(0, len(words)): for column in range(row, len(words)): if e[row][column] != e[column][row]: e[row][column] = 0 e[column][row] = 0 sparse_e = sparse.csr_matrix(e) io.mmwrite(fname, sparse_e) return
def save_data(sco, outputpath): mmf = os.path.join(outputpath, 'expr_m.mtx') # with open(outputpath + 'expr_m.mtx', 'w') as f: # mmwrite(f, sco.expression_matrix) mmwrite(mmf, csc_matrix(sco.expression_matrix)) sc_info = [] sc_info.append(['meta_info']) meta_info = sco.meta_info p = sco.processed p.append('save data') meta_info['processed'] = p meta_info = list([list(meta_info.keys()), list(meta_info.values())]) sc_info.append(meta_info) sc_info[0].append("genes_list") sc_info.append(list(sco.gene_ref.get_list())) for i in sco.cell_info.data_names: sc_info[0].append(i) sc_info.append(list(sco.cell_info[i])) with open(os.path.join(outputpath, 'info.json'), 'w') as f: json.dump(sc_info, f)
def test_sparse_formats(self): mats = [] I = array([0, 0, 1, 2, 3, 3, 3, 4]) J = array([0, 3, 1, 2, 1, 3, 4, 4]) V = array([1.0, 6.0, 10.5, 0.015, 250.5, -280.0, 33.32, 12.0]) mats.append(scipy.sparse.coo_matrix((V, (I, J)), shape=(5, 5))) V = array([ 1.0 + 3j, 6.0 + 2j, 10.50 + 0.9j, 0.015 + -4.4j, 250.5 + 0j, -280.0 + 5j, 33.32 + 6.4j, 12.00 + 0.8j ]) mats.append(scipy.sparse.coo_matrix((V, (I, J)), shape=(5, 5))) for mat in mats: expected = mat.toarray() for fmt in ['csr', 'csc', 'coo']: fn = mktemp(dir=self.tmpdir) # safe, we own tmpdir mmwrite(fn, mat.asformat(fmt)) result = mmread(fn).toarray() assert_array_almost_equal(result, expected)
def test_gzip_py3(self): # test if fix for #2152 works try: # gzip module can be missing from Python installation import gzip except ImportError: return I = array([0, 0, 1, 2, 3, 3, 3, 4]) J = array([0, 3, 1, 2, 1, 3, 4, 4]) V = array([1.0, 6.0, 10.5, 0.015, 250.5, -280.0, 33.32, 12.0]) b = scipy.sparse.coo_matrix((V, (I, J)), shape=(5, 5)) mmwrite(self.fn, b) fn_gzip = "%s.gz" % self.fn with open(self.fn, 'rb') as f_in: f_out = gzip.open(fn_gzip, 'wb') f_out.write(f_in.read()) f_out.close() a = mmread(fn_gzip).toarray() assert_array_almost_equal(a, b.toarray())
def test_bzip2_py3(self): # test if fix for #2152 works try: # bz2 module isn't always built when building Python. import bz2 except ImportError: return I = array([0, 0, 1, 2, 3, 3, 3, 4]) J = array([0, 3, 1, 2, 1, 3, 4, 4]) V = array([1.0, 6.0, 10.5, 0.015, 250.5, -280.0, 33.32, 12.0]) b = scipy.sparse.coo_matrix((V, (I, J)), shape=(5, 5)) mmwrite(self.fn, b) fn_bzip2 = "%s.bz2" % self.fn with open(self.fn, 'rb') as f_in: f_out = bz2.BZ2File(fn_bzip2, 'wb') f_out.write(f_in.read()) f_out.close() a = mmread(fn_bzip2).toarray() assert_array_almost_equal(a, b.toarray())
def export_matrix(self, filename, matrix_name=None, output_format='matlab', mu=None): """Save the matrix of the operator to a file. Parameters ---------- filename Name of output file. matrix_name The name, the output matrix is given. (Comment field is used in case of Matrix Market output_format.) If `None`, the |Operator|'s `name` is used. output_format Output file format. Either `matlab` or `matrixmarket`. mu The |Parameter| to assemble the to be exported matrix for. """ assert output_format in {'matlab', 'matrixmarket'} matrix = self.assemble(mu).matrix matrix_name = matrix_name or self.name if output_format == 'matlab': savemat(filename, {matrix_name: matrix}) else: mmwrite(filename, matrix, comment=matrix_name)
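# Usage sketch (assumption, not from the original source): `op` stands for any
# operator object exposing the export_matrix method above; the file names and
# matrix_name are hypothetical.
op.export_matrix('system_matrix.mtx', matrix_name='A', output_format='matrixmarket')
op.export_matrix('system_matrix.mat', matrix_name='A', output_format='matlab')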
def write_matrix_10X(self, df, matrix_dir): if not os.path.exists(matrix_dir): os.mkdir(matrix_dir) df_UMI = df.groupby(['geneID', 'Barcode']).agg({'UMI': 'count'}) mtx = coo_matrix( (df_UMI.UMI, (df_UMI.index.codes[0], df_UMI.index.codes[1]))) gene_id = df_UMI.index.levels[0].to_series() # add gene symbol gene_name = gene_id.apply(lambda x: self.gtf_dict[x]) genes = pd.concat([gene_id, gene_name], axis=1) genes.columns = ['gene_id', 'gene_name'] barcodes = df_UMI.index.levels[1].to_series() genes.to_csv(f'{matrix_dir}/{FEATURE_FILE_NAME}', index=False, sep='\t', header=False) barcodes.to_csv(f'{matrix_dir}/{BARCODE_FILE_NAME}', index=False, sep='\t', header=False) mmwrite(f'{matrix_dir}/{MATRIX_FILE_NAME}', mtx)