Example #1
def generate_training_validating_rt(version, r_to_i, u_to_i, r_u_t_fn,
                                    split, is_test=False):
    """Function called to generate training.mtx, validating.mtx and
       recommendation_times.npy
    """
    if is_test:
        data_processed_dir = join(PROCESSED_DATA_DIR, "test")
    else:
        data_processed_dir = PROCESSED_DATA_DIR

    u_r_times = mmread(r_u_t_fn).transpose().tolil()
    nu, nr = u_r_times.shape

    training_matrix = lil_matrix((nu,nr), dtype=np.int_)
    validating_matrix = lil_matrix((nu,nr), dtype=np.int_)
    recommendation_times = np.zeros(nu, dtype=np.int_)

    valid_repositories_table = version+"_repositories"
    cursor = getDB(is_test=is_test).cursor()

    for uidx in xrange(nu):
        cursor.execute("""SELECT vr.id
                          FROM repositories as r,
                               {} as vr
                          WHERE vr.id = r.id AND r.owner_id = %s
                       """.format(valid_repositories_table), (u_to_i.r(uidx),))
        owned_rs = np.array([r_to_i[r[0]] for r in cursor])
        interests = u_r_times.getrowview(uidx)
        interested_rs = np.unique(interests.tocoo().col)
        ext_rs = np.setdiff1d(interested_rs, owned_rs, assume_unique=True)
        times = interests[0,ext_rs].toarray()[0]
        sorted_indices = times.argsort()
        threshold = int(floor(split*len(ext_rs)))
        training = [ext_rs[i] for i in sorted_indices[:threshold]]
        threshold_time = times[sorted_indices[threshold]]
        training += [r for r in owned_rs if interests[0,r] < threshold_time]
        validating = [ext_rs[i] for i in sorted_indices[threshold:]]

        for t in training:
            training_matrix[uidx,t] = 1
        for v in validating:
            validating_matrix[uidx,v] = 1
        recommendation_times[uidx] = threshold_time

    comment="""
Training interests are before validating interests.
The split is as follows:
    Training: all internals before first last 1/3 externals + first 2/3 externals
    Testing: last 1/3 externals"""

    version_dir = join(data_processed_dir, version)
    tfn = join(version_dir, TRAINING_FN)
    vfn = join(version_dir, VALIDATING_FN)
    rtfn = join(version_dir, RECOMMENDATION_TIMES_FN)

    mmwrite(tfn, training_matrix, comment=comment)
    mmwrite(vfn, validating_matrix, comment=comment)
    np.save(rtfn, recommendation_times)

    return (tfn, vfn, rtfn)
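
As a quick sanity check on the time-ordered split above, the threshold logic can be exercised on a toy timestamp vector; this is a minimal illustrative sketch, not part of the pipeline:

import numpy as np
from math import floor

times = np.array([50, 10, 40, 20, 30])       # external-interest timestamps
split = 2.0 / 3.0
sorted_indices = times.argsort()             # oldest interests first
threshold = int(floor(split * len(times)))   # 3 of 5 go to training
training = sorted_indices[:threshold]        # earliest 2/3
validating = sorted_indices[threshold:]      # latest 1/3
threshold_time = times[sorted_indices[threshold]]
print(training, validating, threshold_time)  # [1 3 4] [2 0] 40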
Example #2
def gen_app_pop_count(dev_app, ga_train, ga_test, base_dir='/data/'):
    start_time = time.time()

    print('generating popularity weighted app count per device')

    app_popularity = dev_app.groupby(['app_id'])['device_id'].agg(
        {'popularity': lambda x: x.nunique()})
    app_pop_count = dev_app.groupby(['device_id'])['app_id'].agg(
        {'app_pop_count': lambda x: app_popularity.loc[x.unique(), 'popularity'].sum()})


    app_count_train = ga_train['device_id'].map(
        app_pop_count['app_pop_count']).fillna(0)
    app_count_train = app_count_train / app_count_train.max()

    app_count_train = csr_matrix(app_count_train.values).transpose()

    app_count_test = ga_test['device_id'].map(app_pop_count['app_pop_count']).fillna(0)
    app_count_test = app_count_test / app_count_test.max()

    app_count_test = csr_matrix(app_count_test.values).transpose()

    print('train set shape: ', app_count_train.shape)
    io.mmwrite(base_dir + "train_apppopcount.mtx", app_count_train)

    print('test set shape: ', app_count_test.shape)
    io.mmwrite(base_dir + "test_apppopcount.mtx", app_count_test)
    print('Time generating app pop count: ', (time.time() - start_time) / 60)
Example #3
def encode():
    """
    Generate extra features from pairs, triplets, and common
    quadruplets of the existing features and then save those features
    in a sparse matrix to disk.
    """
    dftrain = load_dataframe('train')
    dftest = load_dataframe('test')
    lentrain = len(dftrain)
    all_data = np.vstack((dftrain.iloc[:, 1:-1], dftest.iloc[:, 1:-1]))
    np.array(dftrain.ACTION).dump('{}/train_truth.dat'.format(ddir))
    
    dp = group_data(all_data, degree=2, remove_unique=True)
    dt = group_data(all_data, degree=3, remove_unique=True)
    dq = group_data(all_data, degree=4, remove_unique=True)
    dq = remove_rare(dq, 15)

    X = all_data[:lentrain]
    X_2 = dp[:lentrain]
    X_3 = dt[:lentrain]
    X_4 = dq[:lentrain]
    X_train_all = np.hstack((X, X_2, X_3, X_4))
    mmwrite('{}/train_encoded'.format(ddir), X_train_all)

    X_test = all_data[lentrain:]
    X_test_2 = dp[lentrain:]
    X_test_3 = dt[lentrain:]
    X_test_4 = dq[lentrain:]
    X_test_all = np.hstack((X_test, X_test_2, X_test_3, X_test_4))
    mmwrite('{}/test_encoded'.format(ddir), X_test_all)
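
encode() relies on a group_data helper that is not reproduced here. A minimal sketch of the behavior its call sites imply (one hashed categorical code per row for every combination of `degree` columns) could look like the following; this is an assumption based on usage, not the project's actual implementation, and the remove_unique / remove_rare filtering is omitted:

import numpy as np
from itertools import combinations

def group_data_sketch(data, degree=2):
    # For every combination of `degree` columns, hash each row's value
    # tuple into a single categorical code (one new column per combination).
    m, n = data.shape
    new_cols = [[hash(tuple(row)) for row in data[:, list(idx)]]
                for idx in combinations(range(n), degree)]
    return np.array(new_cols).T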
Example #4
def build_nontext_vector(fin, colname, colidx, normalize):
  """
  Handles the specified column as a categorical variable.
  """
  print "Building category vector for %s" % (colname)
  fout = str.replace(fin, ".csv", "." + colname + ".mtx")
  if os.path.isfile(fout):
    return
  ftmp = str.replace(fin, ".csv", ".tmp")
  reader = csv.reader(open(fin, 'rb'))
  tmpwriter = open(ftmp, 'wb')
  ln = 0
  for row in reader:
    ln += 1
    if ln <= 1:
      continue
    if ln % 1000 == 0:
      print "...(processed %d lines)" % (ln)
    colval = str.lower(row[colidx])
    if normalize:
      colval = str.replace(colval, " ", "_")
    if len(colval.rstrip()) == 0:
      colval = "UNK"
    tmpwriter.write(colval + "\n")
  tmpwriter.close()
  tmpreader = open(ftmp, 'rb')
  vectorizer = sft.CountVectorizer(max_features=100)
  catmatrix = vectorizer.fit_transform(tmpreader)
  os.remove(ftmp)
  writer = open(fout, 'wb')
  sio.mmwrite(writer, catmatrix)
  writer.close()
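
For reference, the vectorization step reduces to fitting a CountVectorizer on one category token per line; a minimal toy sketch (importing sklearn directly rather than as sft):

from sklearn.feature_extraction.text import CountVectorizer

cats = ["new_york", "london", "UNK", "new_york"]
vectorizer = CountVectorizer(max_features=100)
catmatrix = vectorizer.fit_transform(cats)   # one row per input line
print(sorted(vectorizer.vocabulary_))        # ['london', 'new_york', 'unk']
print(catmatrix.toarray())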
Example #5
  def __init__(self, programEntities):
    nusers = len(programEntities.userIndex.keys())
    self.numFriends = np.zeros((nusers))
    self.userFriends = ss.dok_matrix((nusers, nusers))
    fin = open("../Data/user_friends.csv", 'rb')
    fin.readline()                # skip header
    ln = 0
    for line in fin:
#      if ln % 100 == 0:
#        print "Loading line: ", ln
      cols = line.strip().split(",")
      user = cols[0]
      if programEntities.userIndex.has_key(user):
        friends = cols[1].split(" ")
        i = programEntities.userIndex[user]
        self.numFriends[i] = len(friends)
        for friend in friends:
          if programEntities.userIndex.has_key(friend):
            j = programEntities.userIndex[friend]
            # the objective of this score is to infer the degree to
            # and direction in which this friend will influence the
            # user's decision, so we sum the user/event score for
            # this user across all training events.
            eventsForUser = programEntities.userEventScores.getrow(j).todense()
            score = eventsForUser.sum() / np.shape(eventsForUser)[1]
            self.userFriends[i, j] += score
            self.userFriends[j, i] += score
      ln += 1
    fin.close()
    # normalize the arrays
    sumNumFriends = self.numFriends.sum(axis=0)
    self.numFriends = self.numFriends / sumNumFriends
    sio.mmwrite("../Models/UF_numFriends", np.matrix(self.numFriends))
    self.userFriends = normalize(self.userFriends, norm="l1", axis=0, copy=False)
    sio.mmwrite("../Models/UF_userFriends", self.userFriends)
Example #6
  def __init__(self, programEvents):
        
    nevents = len(programEvents.eventIndex.keys())
    self.eventPopularity = ss.dok_matrix((nevents, 5))
    self.eventAttendees = collections.defaultdict(list)
    f = open("/users/chaitanya/PyCharmProjects/EventRec/data/event_attendees.csv", 'rb')
    f.readline() # skip header
    
    for line in f:
      cols = line.strip().split(",")
      eventId = cols[0]

      if programEvents.eventIndex.has_key(eventId):      
        i = programEvents.eventIndex[eventId]
        self.eventPopularity[i, 0] = len(cols[1].split(" ")) - len(cols[4].split(" "))  # number of yes minus no
        self.eventPopularity[i, 1] = len(cols[3].split(" "))  # number of invited folks

        self.eventAttendees[i].append(cols[1].split(" "))  # list of yes folks
        self.eventAttendees[i].append(cols[2].split(" "))  # list of no folks
        self.eventAttendees[i].append(cols[3].split(" "))  # list of invited folks
    f.close()
    
    self.eventPopularity = normalize(self.eventPopularity, norm="l1",axis=0, copy=False)
    sio.mmwrite("/users/chaitanya/PyCharmProjects/EventRec/Models/EA_eventPopularity", self.eventPopularity)
    cPickle.dump(self.eventAttendees, open("/users/chaitanya/PyCharmProjects/EventRec/Models/PE_eventAttendees.pkl", 'wb'))
Example #7
def genJaccard(feature_matrix):
   
    jaccard_matrix_pre = []

    # jaccard_matrix_pre is a list of arrays holding the non-zero indices
    # of each article in the corpus
    for i in feature_matrix[0:test_num]:
        indices = np.flatnonzero(i)
        jaccard_matrix_pre.append(indices)

    S = sparse.dok_matrix((test_num, test_num))
    t0 = time.time()
    numi = 0
    for i in jaccard_matrix_pre:
        jnum = 0
        for j in jaccard_matrix_pre[0:numi+1]:  # n choose 2 comparisons instead of n^2
            divisor = float(len(set(i).union(set(j))))
            if divisor != 0:
                actual_jaccard = float(len(set(i).intersection(set(j)))) / divisor
                if actual_jaccard != 0 and actual_jaccard != 1:
                    S[numi, jnum] = actual_jaccard
            jnum = jnum + 1
        numi = numi + 1
    with open('pickled_minhash/actual_jaccard_matrix_small.mtx', 'wb') as f:
        #size of feature_matrix_large: 1261 x 19043
        io.mmwrite(f, S)
    
    print("TIME to generate jaccard_matrix: {}".format(time.time()-t0))
Example #8
def store_matrix(matrix='',
                 output_dir_path='',
                 out_file_name='',
                 output_format=''):
    """store_matrix."""
    if not os.path.exists(output_dir_path):
        os.mkdir(output_dir_path)
    full_out_file_name = os.path.join(output_dir_path, out_file_name)
    if output_format == "MatrixMarket":
        if len(matrix.shape) == 1:
            raise Exception(
                "'MatrixMarket' format supports only 2-dimensional arrays, "
                "not vectors")
        else:
            io.mmwrite(full_out_file_name, matrix, precision=None)
    elif output_format == "numpy":
        np.save(full_out_file_name, matrix)
    elif output_format == "joblib":
        joblib.dump(matrix, full_out_file_name)
    elif output_format == "text":
        with open(full_out_file_name, "w") as f:
            if len(matrix.shape) == 1:
                for x in matrix:
                    f.write("%s\n" % (x))
            else:
                raise Exception(
                    "'text' format supports only one-dimensional arrays, "
                    "not matrices")
    logger.info("Written file: %s" % full_out_file_name)
Example #9
def get_content_similarity_scores(readmes, dataset_dir, profile="tfidf",
                                  similarity="cos"):
    """Return CSR matrix of similarity_{r,r} for all r in `readmes`.

       `dataset_dir`      the directory where the similarity scores are
       `profile`    bool or tfidf
       `similarity` cos or ijd (inverse Jacquard Distance)
    """
    if profile == "tfidf":
        sim_fn = join(dataset_dir, TF_IDF_FN)

    if exists(sim_fn):
        return mmread(sim_fn).tocsr()

    if profile == "bool":
        #readme_words = COUNTVECTORIZER readmes
        pass
    else:
        tfidf = TfidfVectorizer(input='file', #sublinear_tf=True,
                                max_df=0.5, stop_words='english',
                                decode_error="ignore")
        #max_df=0.5: if a word occurs in more than half of the readmes it is
        #            ignored
        readme_words = tfidf.fit_transform(readmes)

    if similarity == "cos":
        similarity_scores = csr_matrix(cosine_similarity(readme_words))
    else:
        # similarity_scores = csr_matrix(ijd(readme_words))
        pass

    mmwrite(sim_fn, similarity_scores, comment=profile+"_"+similarity+"_similarity_{r,r}")
    return similarity_scores
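
The implemented tfidf/cos path boils down to TfidfVectorizer followed by cosine_similarity; a toy sketch of that combination, passing strings directly instead of the function's input='file' handles:

from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

readmes = ["sparse matrix tools", "matrix market io", "web frontend code"]
tfidf = TfidfVectorizer(stop_words="english")
readme_words = tfidf.fit_transform(readmes)            # readme-by-term matrix
similarity_scores = csr_matrix(cosine_similarity(readme_words))
print(similarity_scores.toarray().round(2))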
Example #10
def SOP(alpha, teta, nbBasket, nbReco):
	
	data = load()
	###############################################################
	# CREATE MODELS
	###############################################################
	print 'Create the model based on the training set'
	
	modelSOP = processing.SOPRecoModel(data.getUserItemMatrix(), alpha, teta)
	modelSOP.launch()
		
	###############################################################
	# SET RECOMMENDATION
	###############################################################
	if nbBasket == -1:
		evalSOP = processing.Evaluation(modelSOP, data.getBasketItemList(), nbReco)
	else :
		evalSOP = processing.Evaluation(modelSOP, data.getBasketItemList()[:nbBasket], nbReco)
	
	###############################################################
	# LAUNCH RECOMMENDATION + SAVE RESULTS
	###############################################################	
	t = time.time()
	evalSOP.newEval()
	SOPTime = time.time()-t
	mmwrite('SOPPerf_a%s_t%s_nb%s_nr%s'%(alpha,teta,nbBasket,nbReco),evalSOP.perf) 
	
	print 'SOP Execution time:', SOPTime
	print 'Performances : '
	print evalSOP.testNames
	print evalSOP.meanPerf()
	evalSOP.savePerf('SOPPerf_a%s_t%s_nb%s_nr%s.txt'%(alpha,teta,nbBasket,nbReco))
	return evalSOP
Example #11
def RWWR(alpha, nbBasket, nbReco):
	data = load()
	###############################################################
	# CREATE MODELS
	###############################################################
	print 'Create the model based on the training set'
	
	modelRWWR = processing.RandomWalkWithRestartRecoModel(data.getUserItemMatrix(), alpha)
		
	###############################################################
	# SET RECOMMENDATION
	###############################################################
	if nbBasket == -1:
		evalRWWR = processing.Evaluation(modelRWWR, data.getBasketItemList(), nbReco)
	else :
		evalRWWR = processing.Evaluation(modelRWWR, data.getBasketItemList()[:nbBasket], nbReco)
	
	###############################################################
	# LAUNCH RECOMMENDATION + SAVE RESULTS
	###############################################################	
	t = time.time()
	evalRWWR.newEval()
	RWWRTime = time.time()-t
	mmwrite('RWWR_a%s_nb%s'%(alpha, nbBasket),evalRWWR.perf) 
	
	print 'RWWR Execution time:', RWWRTime
	print 'Performances :'
	print evalRWWR.testNames
	print evalRWWR.meanPerf()
	evalRWWR.savePerf('RWWR_a%s_nb%s'%(alpha, nbBasket))
	return evalRWWR
Example #12
def Cosine(nbBasket, nbReco):
	data = load()
	###############################################################
	# CREATE MODELS
	###############################################################
	print 'Create the model based on the training set'
	
	modelCosine = processing.CosineRecoModel(data.getUserItemMatrix())
		
	###############################################################
	# SET RECOMMENDATION
	###############################################################
	if nbBasket == -1:
		evalCosine = processing.Evaluation(modelCosine, data.getBasketItemList(), nbReco)
	else :
		evalCosine = processing.Evaluation(modelCosine, data.getBasketItemList()[:nbBasket], nbReco)
	
	###############################################################
	# LAUNCH RECOMMENDATION + SAVE RESULTS
	###############################################################	
	t = time.time()
	evalCosine.newEval()
	CosineTime = time.time()-t
	mmwrite('Cosine_nb%s'%nbBasket,evalCosine.perf) 
	
	print 'Cosine Execution time:', CosineTime
	print 'Performances :'
	print evalCosine.testNames
	print evalCosine.meanPerf()
	evalCosine.savePerf('Cosine_nb%s.txt'%nbBasket)
	return evalCosine
Example #13
def make_doc_vectors(fname_pat, out_fname):
    fnames = glob(fname_pat)
    labels = [splitext(basename(fn))[0] for fn in fnames]
    stop_words = frozenset(list(ENGLISH_STOP_WORDS) + OTHER_STOPWORDS)
    vectorizer = CountVectorizer(input="filename", 
                                 ngram_range=(1,3),
                                 min_df=5, 
                                 max_df=0.7,
                                 stop_words=stop_words,
                                 token_pattern=r"(?u)\b[A-Za-z]\w+\b")
    vectors = vectorizer.fit_transform(fnames)
    
    log.info("saving matrix in Numpy format to " + out_fname)
    np.savez(out_fname, 
             vectorizer=vectorizer,
             vectors=vectors,
             labels=labels)
    
    base_fname = splitext(out_fname)[0]
    
    mm_fname = base_fname + ".mtx"
    log.info("saving matrix in Matrix Market format to " + mm_fname)
    mmwrite(mm_fname, vectors, "IDIScape document vectors", "integer")

    feat_fname = base_fname + "_features.txt"
    log.info("saving features to " + feat_fname)
    feat_names = vectorizer.get_feature_names()
    # codecs.open, not the builtin: the third positional argument is the encoding
    codecs.open(feat_fname, "w", "utf8").write(u"\n".join(feat_names))

    label_fname = base_fname + "_labels.txt"
    log.info("saving labels to " + label_fname)
    codecs.open(label_fname, "w", "utf8").write(u"\n".join(labels))
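
Because the vectorizer object is saved inside the .npz next to the sparse matrix, loading it back needs allow_pickle and the 0-d object-array unwrap (the same [()] trick make_author_vectors uses below); the file name is illustrative:

import numpy as np

docs = np.load("doc_vectors.npz", allow_pickle=True)  # objects are pickled
vectors = docs["vectors"][()]        # unwrap 0-d object array to the sparse matrix
vectorizer = docs["vectorizer"][()]
labels = docs["labels"]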
Example #14
 def nearest_neighbor_degree_analysis(self, target, method="online"):
     """nearest neighbors' degree
     ref Empirical analysis of web-based user-object bipartite networks, Ming-Sheng Shang et al. 17 June 2010.
     target = 'user' means nearest neighbors' degree for users,
     target = 'item' means nearest neighbors' degree for items.        
     method = 'online' means online calculation,
     method = 'offline' means using offline results.        
     """
     filepath = "./offline_results/nn_degree"
     if method == "online":# online calculation
         tinynum = 0.00000001
         if target == "user":# for user
             self.ui_matrix = self.ui_matrix.tocsc()
             degree = sparse.csc_matrix(self.ui_matrix.sum(0))
             # degree = degree + sparse.csc_matrix(np.ones([1, self.usernum]))*tinynum# to avoid zero division
             nn_degree = sparse.csc_matrix(self.ui_matrix.sum(1).transpose())\
                 .dot(self.ui_matrix)/degree
         elif target == "item":# for item
             self.ui_matrix = self.ui_matrix.tocsr()
             degree = sparse.csr_matrix(self.ui_matrix.sum(1))
             # degree = degree + sparse.csr_matrix(np.ones([self.itemnum, 1]))*tinynum# to avoid zero division
             nn_degree = self.ui_matrix.dot(sparse.csr_matrix(self.ui_matrix.sum(0).transpose()))/degree
         else:
             print "target arg error !"
             sys.exit()
         if target == "user" or target == "item":
             try:
                 io.mmwrite(filepath+"_%s"%target, nn_degree)
             except Exception,e:
                 print e
                 sys.exit()
             return nn_degree
Example #15
    def run(self, ratio, input_db, output_mat):
        db = sqlite3.connect(input_db)
        # assume no empty users
        users = db.execute("""SELECT Users.[Id] FROM Users""").fetchall()
        # pick <ratio> of them for training db, pick <ratio/10> of them for test db
        train_ids = []
        test_ids = []

        test_threshold = ratio/10
        train_threshold = test_threshold + ratio
        for u in users:
            rnd = random.random()
            if (rnd <= test_threshold):
                test_ids.append(u[0])
            elif (rnd <= train_threshold):
                train_ids.append(u[0])

        train_matrix = self.data_to_matrix(db, train_ids).tocsc()
        test_matrix = self.data_to_matrix(db, test_ids).tocsc()

        (train_matrix, test_matrix) = self.trim_matrices(train_matrix, test_matrix)

        savemat(output_mat, {'train' : train_matrix,'test' : test_matrix}, oned_as = 'row')
        mmwrite(output_mat + '.train', train_matrix)
        mmwrite(output_mat + '.test', test_matrix)
        print("Done!")
Example #16
def main():
    """
        Main entry point to script to perform kmeans.

        Returns:

        - `0` or `1` on success or failure respectively.
        - Saves `centroids`, `centroiddict`, and `clusters` in working dir.

    """
    parser = gen_args()
    args = parser.parse_args()
    sessionid = args.sessionid
    data = spio.mmread(args.data).tocsc()
    logger = logging.getLogger(__name__)
    logger.addHandler(logging.StreamHandler())
    if args.verbose:
        logger.setLevel(logging.DEBUG)
    if args.k:
        k = args.k
    kmeans = KMeans(data, k, args.n, args.delta, args.randomcentroids, \
                    args.classical, args.verbose)
    result = kmeans.run()
    clusters = result['clusters']
    centroids = result['centroids']
    centroiddict = result['centroiddict']
    cPickle.dump(clusters, open("data_clusters_" + sessionid + '.pck', 'w'))
    cPickle.dump(centroiddict, open("centroid_dict_" + \
                                    sessionid + '.pck', 'w'))
    spio.mmwrite(open("data_centroids_" + sessionid + '.mtx', 'w'), \
                 centroids, comment="CSC Matrix", field='real')
    logger.info(" %d Clusters Generated ", len(clusters))
    return 0
Example #17
 def __init__(self, programEntities, sim=ssd.correlation):
   cleaner = DataCleaner()
   nusers = len(programEntities.userIndex.keys())
   fin = open("../Data/users.csv", 'rb')
   colnames = fin.readline().strip().split(",")
   self.userMatrix = ss.dok_matrix((nusers, len(colnames) - 1))
   for line in fin:
     cols = line.strip().split(",")
     # consider the user only if he exists in train.csv
     if programEntities.userIndex.has_key(cols[0]):
       i = programEntities.userIndex[cols[0]]
       self.userMatrix[i, 0] = cleaner.getLocaleId(cols[1])
       self.userMatrix[i, 1] = cleaner.getBirthYearInt(cols[2])
       self.userMatrix[i, 2] = cleaner.getGenderId(cols[3])
       self.userMatrix[i, 3] = cleaner.getJoinedYearMonth(cols[4])
       self.userMatrix[i, 4] = cleaner.getCountryId(cols[5])
       self.userMatrix[i, 5] = cleaner.getTimezoneInt(cols[6])
   fin.close()
   # normalize the user matrix
   self.userMatrix = normalize(self.userMatrix, norm="l1", axis=0, copy=False)
   sio.mmwrite("../Models/US_userMatrix", self.userMatrix)
   # calculate the user similarity matrix and save it for later
   self.userSimMatrix = ss.dok_matrix((nusers, nusers))
   for i in range(0, nusers):
     self.userSimMatrix[i, i] = 1.0
   for u1, u2 in programEntities.uniqueUserPairs:
     i = programEntities.userIndex[u1]
     j = programEntities.userIndex[u2]
     if not self.userSimMatrix.has_key((i, j)):
       usim = sim(self.userMatrix.getrow(i).todense(),
         self.userMatrix.getrow(j).todense())
       self.userSimMatrix[i, j] = usim
       self.userSimMatrix[j, i] = usim
   sio.mmwrite("../Models/US_userSimMatrix", self.userSimMatrix)
Example #18
def main(argv):
    inputfile = ''
    outputfile = ''
 
    try:
        opts, args = getopt.getopt(argv,"hi:o:",["ifile=","ofile="])
    except getopt.GetoptError:
        print 'python <script name> -i <inputfile> -o <outputfile>'
        sys.exit(2)
 
    for opt, arg in opts:
        if opt == '-h':
            print 'python <script name> -i <inputfile> -o <outputfile>'
            sys.exit()
        elif opt in ("-i"):
            inputfile = arg
        elif opt in ("-o"):
            outputfile = arg
 
    print 'Reading images from', inputfile
    print 'Writing vectors to', outputfile

    input_file_name = inputfile

    """ ################### START PROGRAM ############################ """  

    print "--------------STRAT-----------------"
    running_time = time.now()
    data_matrix = process(input_file_name)
    print "Total time:" + str(time.now() - running_time)
    print "Writing to file."
    io.mmwrite(outputfile, data_matrix)
Example #19
def test4():
    n = 3 ; p = 2
    geo = periodic_square(n=[n,n], p=[p,p])
    geo_ref, list_lmatrices = geo.bezier_extract()
    from scipy.io import mmwrite
    M = list_lmatrices[0][0]
    mmwrite("M_conversion_test_4.mtx", M)
Example #20
def RW_POP(alpha, nbBasket, nbReco):
	data = load()
	###############################################################
	# CREATE MODELS
	###############################################################
	print 'Create the model based on the training set'
	
	modelRW = processing.BasketRandomWalk_POP(data.getUserItemMatrix(), alpha)
		
	###############################################################
	# SET RECOMMENDATION
	###############################################################
	if nbBasket == -1:
		evalRW = processing.Evaluation(modelRW, data.getBasketItemList(), nbReco)
	else :
		evalRW = processing.Evaluation(modelRW, data.getBasketItemList()[:nbBasket], nbReco)
	
	###############################################################
	# LAUNCH RECOMMENDATION + SAVE RESULTS
	###############################################################	
	t = time.time()
	evalRW.newEval()
	RWTime = time.time()-t
	mmwrite(resultFolder+'RW_POP_a%s_nb%s'%(alpha, nbBasket),evalRW.perf) 
	
	print 'RW_POP Execution time:', RWTime
	print 'Performances :'
	print evalRW.testNames
	print evalRW.computePerf()
	evalRW.savePerf(resultFolder+'RW_POP_a%s_nb%s.txt'%(alpha, nbBasket))
	return evalRW
Example #21
def save_results(results, name, version="1_1"):

    expt_action_mean, expt_action_std, \
        expt_reward_mean, expt_reward_std, epsilon = results

    if expt_action_mean is not None:
        mmwrite("./out/ex%s_%s_action_mean.mtx" % (version, name.lower()),
                expt_action_mean,
                "Experiment %s %s actions mean." % (version, name))

    if expt_action_std is not None:
        mmwrite("./out/ex%s_%s_action_std.mtx" % (version, name.lower()),
                expt_action_std,
                "Experiment %s %s actions SD." % (version, name))

    if expt_reward_mean is not None:
        mmwrite("./out/ex%s_%s_reward_mean.mtx" % (version, name.lower()),
                expt_reward_mean,
                "Experiment %s %s rewards mean." % (version, name))

    if expt_reward_std is not None:
        mmwrite("./out/ex%s_%s_reward_std.mtx" % (version, name.lower()),
                expt_reward_std,
                "Experiment %s %s rewards SD." % (version, name))

    if epsilon is not None:
        mmwrite("./out/ex%s_%s_epsilon.mtx" % (version, name.lower()),
                epsilon,
                "Experiment %s %s exploration rates." % (version, name))
Example #22
    def buildBaseDB(self, verticesPath):

        mainDir = os.path.dirname(verticesPath)
        verticesFileName = os.path.basename(verticesPath)
        targetsDBPath = os.path.join(mainDir,self.targetDBFolder)

        if "_vertices" not in verticesFileName:
            print "Error: the vertices file name must have the form of *_vertices.dat"
            return

        if not os.path.isdir(targetsDBPath):
            print "Error: targets folder %s not found"%(targetsDBPath)
            return

        base_file = os.path.join(mainDir, verticesFileName.replace("_vertices","_base"))
        vertices, lookup = buildbase.read_vertices(verticesPath)

        print "Loading targets..."
        rb, target_names = buildbase.load_targets(targetsDBPath, vertices, lookup)
        print "Making base..."
        base, back = buildbase.make_base(rb, self.cutOff)
        del rb
        print "Saving base..."
        f = open(base_file, 'w')
        mmwrite(f, base)
        f.close()
        del base
        print "Saved in %s" % (base_file)
Example #23
def make_author_vectors(crawl_fname, doc_vec_fname, auth_vec_fname):
    docs = np.load(doc_vec_fname)
    doc_vecs = docs["vectors"][()]
    # Convert to LIL, because modifying CSR is slow
    doc_vecs = doc_vecs.tolil()
    
    # Create mapping from label (=DOI) to row number (=doc vector)  
    doi2n = dict((l,i) for i,l in enumerate(docs["labels"]))
    
    # Collect authors         
    tree = etree.parse(crawl_fname)
    authors = np.array(list(set(tree.xpath("//author/text()"))))

    # Create empty author vectors
    shape = (len(authors), doc_vecs.shape[1])
    auth_vecs = sp.lil_matrix(shape)     
    
    # Create mapping from authors to row number (=author vector)
    auth2n = dict((a,i) for i,a in enumerate(authors))
    
    ## author to group mapping
    ##auth2group = {}
    
    # Fill author vectors by adding doc vectors 
    for item in tree.findall("//item"):
        author = item.find("author").text
        ##group = item.find("group")
        ##auth2group[author] = group
        url = item.find("url").text
        query = urlparse.urlparse(url).query
        doi = urlparse.parse_qs(query)["doi"][0]
        log.debug(u"DOI={} author={}".format(doi, author))
        
        try:
            auth_vecs[auth2n[author]] += doc_vecs[doi2n[doi]]
        except KeyError:
            log.warning(u"No document with DOI={} for author {}".format(
                doi, author))
            
    auth_vecs = auth_vecs.tocsr()
    
    ##group_labels = [auth2group[auth] for auth in authors]
           
    log.info("saving matrix in Numpy format to " + auth_vec_fname)
    np.savez(auth_vec_fname, 
             vectorizer=docs["vectorizer"],
             vectors=auth_vecs,
             author_labels=authors,
             ##group_labels=group_labels
             ) 
    
    base_fname = splitext(auth_vec_fname)[0]
    
    mm_fname = base_fname + ".mtx"
    log.info("saving matrix in Matrix Market format to " + mm_fname)
    mmwrite(mm_fname, auth_vecs, "IDIScape author vectors", "integer")
    
    label_fname = base_fname + "_labels.txt"
    log.info("saving labels to " + label_fname)
    codecs.open(label_fname, "w", "utf8").write(u"\n".join(authors))
Example #24
def save_new_ref(filename, data):
    """ Saves a new version of the reference data, and backs up the old """
    
    ext = filename.split('.')[-1]
    
    if data is None:
        print("WARNING: Error generating file: %s" % filename)
        print("Skipped... try again.")
        return
    
    if os.path.exists(filename):
        os.system( 'mv %s %s' % (filename, BACKUP_DIR) )
    
    if ext in ['h5', 'lh5']:
        if scipy.sparse.issparse(data):
            data = data.toarray()
        Serializer.SaveData(filename, data)
    elif ext == 'mtx':
        io.mmwrite(filename, data)
    elif ext == 'pkl':
        f = open(filename, 'w')
        pickle.dump(data, f)  # pickle.dump takes (obj, file)
        f.close()
    else:
        raise ValueError('Could not understand extension (.%s) for %s' % (ext, filename))
    
    return
Example #25
    def test1D2():
        spl = splineRefMat(DIM_1D)
    #    list_r = list(np.random.random(20))
        list_r = [0.1,0.2,0.3]

        nx = 3
        px = 2
        geo = line(n=[nx], p=[px])

        nrb     = geo[0]
        knots   = nrb.knots[0]
        n       = nrb.shape[0]
        p       = nrb.degree[0]
        P       = nrb.points

        M = spl.construct(list_r, p, n, knots)
        from scipy.io import mmwrite
        mmwrite('M.mtx', M)
        R = M.dot(nrb.points[:,0])

        geo = line(n=[nx], p=[px])
        geo.refine(id=0, list_t=[list_r])
        nrb     = geo[0]
        P = np.asarray(nrb.points[:,0])

        assert(np.allclose(P,R))
        print("test1D2: OK")
Example #26
def main111():
  if 1:
    G = nx.read_edgelist(infname)
    print nx.info(G)
    # Graph adj matix
    A = nx.to_scipy_sparse_matrix(G)
    print type(A)
    from scipy import sparse, io
    io.mmwrite("Results/test.mtx", A)
    exit()
    # write to disk clustering coeffs for this graph
    snm.get_clust_coeff([G], 'orig', 'mmonth')
    # write to disk egienvalue
    snm.network_value_distribution([G], [], 'origMmonth')

  if 0:
    edgelist = np.loadtxt(infname, dtype=str, delimiter='\t')
    print edgelist[:4]
    idx = np.arange(len(edgelist))
    np.random.shuffle(idx)
    subsamp_edgelist = edgelist[idx[:100]]
    G = nx.Graph()
    G.add_edges_from([(long(x), long(y)) for x, y in subsamp_edgelist])

  # visualize this graph
  # visualize_graph(G)
  exit()

  G = nx.Graph()
  G.add_edges_from([(long(x), long(y)) for x, y in edgelist])
  print nx.info(G)
  print 'Done'
Example #27
def Pop(nbBasket, nbReco):
	data = load()
	###############################################################
	# CREATE MODELS
	###############################################################
	print 'Create the model based on the training set'
	
	modelPop = processing.PopRecoModel(data.getUserItemMatrix())
		
	###############################################################
	# SET RECOMMENDATION
	###############################################################
	if nbBasket == -1:
		evalPop = processing.Evaluation(modelPop, data.getBasketItemList(), nbReco)
	else :
		evalPop = processing.Evaluation(modelPop, data.getBasketItemList()[:nbBasket], nbReco)
	
	###############################################################
	# LAUNCH RECOMMENDATION + SAVE RESULTS
	###############################################################	
	t = time.time()
	evalPop.newEval()
	PopTime = time.time()-t
	mmwrite(resultFolder+'Pop',evalPop.perf) 
	
	print 'Pop Execution time:', PopTime
	print 'Performances :'
	print evalPop.testNames
	print evalPop.computePerf()
	evalPop.savePerf(resultFolder+'Pop.txt')
	return evalPop
Example #28
def process_dataset(name):
    prefix = 'dataset/' + name + '/' + name
    fea = np.loadtxt(prefix + '.fea')
    # transform link
    link_data = np.loadtxt(prefix + '.link')
    link_data = link_data - 1
    reverse_link_data = np.append(link_data[:, 1][:,np.newaxis], link_data[:, 0][:,np.newaxis], axis=1)
    link_data = np.append(link_data, reverse_link_data, axis=0)
    weight = [1]*link_data.shape[0]
    num_inst = fea.shape[0]
    link = sparse.csr_matrix((weight, link_data.transpose()), shape=(num_inst, num_inst))
    # transform label
    gnd = np.loadtxt(prefix + '.gnd')
    lb = preprocessing.LabelBinarizer()
    label = lb.fit_transform(gnd)
    label = label.astype(np.float)
    # use max component
    g = nx.Graph(link)
    mc = nx.connected_components(g)[0]
    link = link[mc, :][:, mc]
    label = label[mc, :]
    fea = fea[mc, :]
    # save
    np.save(prefix + '_fea', fea)
    mmwrite(prefix + '_link', link)
    np.save(prefix + '_label', label)
    np.save(prefix + '_mc', mc)
    return fea, link, label
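
The link-symmetrization step can be checked in isolation; a minimal sketch with a 3-node toy edge list that is already 0-based:

import numpy as np
from scipy import sparse

link_data = np.array([[0, 1], [1, 2]])
reverse_link_data = link_data[:, ::-1]               # swap the two columns
link_data = np.append(link_data, reverse_link_data, axis=0)
weight = [1] * link_data.shape[0]
link = sparse.csr_matrix((weight, link_data.transpose()), shape=(3, 3))
print(link.toarray())     # symmetric 0/1 adjacency matrix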
Example #29
def df_to_sparse(jour, df_in, Fam, col):

    # INPUTS:
    # jour = day under consideration
    # df_in = name of the input dataframe to process
    # Fam = family under consideration
    # col = columns of the input dataframe to keep in the output (if
    # col = 0 the whole dataframe is kept)

    # OUTPUT:
    # Nothing is returned; the file is saved as a sparse matrix.

    # FUNCTION:
    dossier = str(jour) + '.03.2013'
    path_entree = 'C:\\Users\\lbn\\Documents\\data_frames\\'
    path_sortie = 'C:\\Users\\lbn\\Documents\\data_frames\\sparse_data\\'
    doc_entree = path_entree + dossier + '\\' + df_in + str(jour) + '.pickle'
    doc_sortie = path_sortie + dossier + '\\' + df_in + str(jour) + Fam

    data = read_pickle(doc_entree)
    if type(col) == list:
        data = csr_matrix(data.iloc[:, col], dtype=np.int8)
    else:
        data = csr_matrix(data, dtype=np.int8)
    io.mmwrite(doc_sortie, data)

    print('Conversion done for:', df_in + str(jour) + Fam)
Example #30
def CondProb(alpha, nbBasket, nbReco):
	data = load()
	###############################################################
	# CREATE MODELS
	###############################################################
	print 'Create the model based on the training set'
	
	modelCondProb = processing.CondProbRecoModel(data.getUserItemMatrix(), alpha)
		
	###############################################################
	# SET RECOMMENDATION
	###############################################################
	if nbBasket == -1:
		evalCondProb = processing.Evaluation(modelCondProb, data.getBasketItemList(), nbReco)
	else :
		evalCondProb = processing.Evaluation(modelCondProb, data.getBasketItemList()[:nbBasket], nbReco)
	
	###############################################################
	# LAUNCH RECOMMENDATION + SAVE RESULTS
	###############################################################	
	t = time.time()
	evalCondProb.newEval()
	CondProbTime = time.time()-t
	mmwrite(resultFolder+'CondProb_a%s_nb%s'%(alpha, nbBasket),evalCondProb.perf) 
	
	print 'Cond Prob Execution time:', CondProbTime
	print 'Performances :'
	print evalCondProb.testNames
	print evalCondProb.computePerf()
	evalCondProb.savePerf(resultFolder+'CondProb_a%s_nb%s.txt'%(alpha, nbBasket))
	return evalCondProb
Example #31
def process_SVD2(inputFileName, outputFileName, n, p, showError):
    """
    Perform SVD2.
    """
    mat, rowids = loadMatrix(inputFileName)
    X = mat.tocsc()
    ut, s, vt = sparsesvd(X, n)
    A = np.dot(np.dot(ut.T, np.diag(s**p)), vt)
    saveMatrix(A, rowids, outputFileName)
    mmwrite("%s.ut" % inputFileName, ut)
    np.savetxt("%s.s" % inputFileName, s)
    mmwrite("%s.vt" % inputFileName, vt)

    if showError:
        Xnorm = np.linalg.norm(X.todense(), ord='fro')
        Error = np.linalg.norm((X - A), ord='fro')
        rate = (100 * Error) / Xnorm
        print "Approximation Error Percentage = %f%%" % rate
        print "Frobenius norm of the original matrix =", Xnorm
        print "Frobenius norm of the error matrix =", Error
    pass
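
The rank-n, exponent-weighted reconstruction can be reproduced with dense NumPy for a quick error check; a minimal sketch using np.linalg.svd instead of sparsesvd, with p = 1:

import numpy as np

X = np.random.rand(6, 4)
U, s, Vt = np.linalg.svd(X, full_matrices=False)
n = 2                                                   # singular values kept
A = np.dot(np.dot(U[:, :n], np.diag(s[:n])), Vt[:n])    # rank-n approximation
Xnorm = np.linalg.norm(X, ord='fro')
Error = np.linalg.norm(X - A, ord='fro')
print("Approximation Error Percentage = %f%%" % (100 * Error / Xnorm))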
Example #32
def peak_count_matrix(atac_inter_bed, out_prefix):
    # first three columns is 'chr','start','end'
    # last column is barcode
    inter_peak = pd.read_csv(atac_inter_bed, header=None, sep='\t')
    inter_peak.columns = ['chr', 'start', 'end'
                          ] + [''] * (inter_peak.shape[1] - 4) + ['barcode']
    #inter_peak['peak']=inter_peak['name'].apply(lambda x: x.split('/')[-1])
    inter_peak['peak'] = inter_peak.apply(
        lambda x: x['chr'] + ':' + str(x['start']) + '-' + str(x['end']),
        axis=1)
    bc_peak_counts = inter_peak.groupby(['barcode', 'peak']).size()
    df = pd.DataFrame(bc_peak_counts)
    df.reset_index(inplace=True)
    new_df = df.pivot(index='peak', columns='barcode', values=0)
    new_df.columns.name = None
    new_df.index.name = None
    new_df = new_df.fillna(0)
    new_df = new_df.astype('int')
    mmwrite(out_prefix + 'count.mtx', csr_matrix(new_df))
    np.savetxt(out_prefix + 'peaks.txt', new_df.index.values, fmt="%s")
    np.savetxt(out_prefix + 'barcodes.txt', new_df.columns.values, fmt="%s")
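
Reading the three outputs back restores the labeled peak-by-barcode matrix; a minimal sketch following the out_prefix naming convention above (the prefix itself is illustrative):

import numpy as np
from scipy.io import mmread

prefix = "sample_"
counts = mmread(prefix + "count.mtx").tocsr()            # peaks x barcodes
peaks = np.loadtxt(prefix + "peaks.txt", dtype=str)
barcodes = np.loadtxt(prefix + "barcodes.txt", dtype=str)
assert counts.shape == (len(peaks), len(barcodes))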
Example #33
def prepare_bow_matrix(labeled) -> None:
    """Function to prepare the BOW matrix."""
    savedir = Path(config['data']['save_path'])
    cvect = CountVectorizer(strip_accents='ascii', min_df=2)

    cvect.fit((savedir / 'train.txt').read_text().splitlines())
    if labeled:
        savedir = savedir / 'labeled'

    train_dbyw = cvect.transform(
        (savedir / 'train.txt').read_text().splitlines(),
    )
    valid_dbyw = cvect.transform(
        (savedir / 'valid.txt').read_text().splitlines(),
    )
    test_dbyw = cvect.transform(
        (savedir / 'test.txt').read_text().splitlines(),
    )

    mmwrite(str(savedir / 'train.mtx'), train_dbyw)
    mmwrite(str(savedir / 'valid.mtx'), valid_dbyw)
    mmwrite(str(savedir / 'test.mtx'), test_dbyw)

    with (savedir / 'vocab').open('w', encoding='utf-8') as fvpw:
        # write terms in column (index) order, not arbitrary dict order
        fvpw.write('\n'.join(sorted(cvect.vocabulary_, key=cvect.vocabulary_.get)))

    with (savedir / 'mtx.flist').open('w') as flpw:
        flpw.write('{0}\n'.format(savedir.resolve() / 'train.mtx'))
        flpw.write('{0}\n'.format(savedir.resolve() / 'valid.mtx'))
        flpw.write('{0}\n'.format(savedir.resolve() / 'test.mtx'))
Example #34
def expression_matrix(df, validated_barcodes, outdir, sample, gtf_file):

    matrix_10X_dir = f"{outdir}/{sample}_matrix_10X/"
    matrix_table_file = f"{outdir}/{sample}_matrix.tsv.gz"
    if not os.path.exists(matrix_10X_dir):
        os.mkdir(matrix_10X_dir)

    df.loc[:, 'mark'] = 'UB'
    df.loc[df['Barcode'].isin(validated_barcodes), 'mark'] = 'CB'

    CB_total_Genes = df.loc[df['mark'] == 'CB', 'geneID'].nunique()
    CB_reads_count = df.loc[df['mark'] == 'CB', 'count'].sum()
    reads_mapped_to_transcriptome = df['count'].sum()

    table = df.loc[df['mark'] == 'CB', :].pivot_table(
        index='geneID', columns='Barcode', values='UMI',
        aggfunc=len).fillna(0).astype(int)

    id_name = gene_convert(gtf_file)
    id = table.index.to_series()
    name = id.apply(lambda x: id_name[x])
    genes = pd.concat([id, name], axis=1)
    genes.columns = ['gene_id', 'gene_name']

    # write 10X matrix
    table.columns.to_series().to_csv(f'{matrix_10X_dir}/barcodes.tsv',
                                     index=False,
                                     sep='\t')
    genes.to_csv(f'{matrix_10X_dir}/genes.tsv',
                 index=False,
                 header=False,
                 sep='\t')
    mmwrite(f'{matrix_10X_dir}/matrix', csr_matrix(table))

    # convert id to name; write table matrix
    table.index = name
    table.index.name = ""
    table.to_csv(matrix_table_file, sep="\t", compression='gzip')

    return (CB_total_Genes, CB_reads_count, reads_mapped_to_transcriptome)
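
The pivot_table call is the core of the expression matrix; a toy sketch of the same pivot on illustrative columns:

import pandas as pd

df = pd.DataFrame({'geneID': ['g1', 'g1', 'g2'],
                   'Barcode': ['b1', 'b2', 'b1'],
                   'UMI': ['u1', 'u2', 'u3']})
table = df.pivot_table(index='geneID', columns='Barcode', values='UMI',
                       aggfunc=len).fillna(0).astype(int)
print(table)   # gene-by-barcode UMI counts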
Example #35
def load_graph(name):

    dir = 'data/'
    try:
        if isinstance(name, str):
            meshname = dir + name
        else:
            meshname = dir + names[name]

        mesh = loadmat(meshname)
    except IOError as e:
        print 'Matrix market file : %s.mtx not available...downloading' % meshname
        url = base % (meshname, meshname)
        response = urllib2.urlopen(url)
        graph = response.read()
        adj_lists = [map(int, a.split()) for a in graph.splitlines() if a]

        num_nodes, num_edges = adj_lists[0]
        vertex_degrees = [len(edges) for edges in adj_lists[1:]]
        node_lists = [
            itertools.repeat(i, n)
            for i, n in zip(range(num_nodes), vertex_degrees)
        ]
        I = numpy.array(list(itertools.chain(*node_lists)))
        J = numpy.array(list(itertools.chain(*adj_lists[1:]))) - 1
        V = numpy.ones(2 * num_edges)

        G = coo_matrix((V, (I, J)), shape=(num_nodes, num_nodes))
        mmwrite(meshname, G)  # meshname already includes the data/ prefix

        G_nx = make_graph(G)
        pos = nx.spring_layout(G_nx, iterations=200)
        x = numpy.array([pos[i][0] for i in range(G.shape[0])])
        y = numpy.array([pos[i][1] for i in range(G.shape[0])])
        V = numpy.vstack((x, y)).T
        E = numpy.vstack((G.row, G.col)).T
        mesh = {'V': V, 'E': E}
        savemat(meshname, mesh)

    return mesh
Example #36
def inference(dataloader, net, criterion, opt, OutputDir):
    net.eval()
    for i, (sample_idx, annotation, adj_matrix, label, mask) in enumerate(dataloader, 0):
        padding = torch.zeros(opt.batchSize, opt.n_node, opt.L, opt.state_dim - opt.annotation_dim).double()
        init_input = torch.cat((annotation, padding), 3)

        if opt.cuda:
            adj_matrix = adj_matrix.cuda()
            annotation = annotation.cuda()
            init_input = init_input.cuda()
            label = label.cuda()
            mask = mask.cuda()

        adj_matrix = Variable(adj_matrix)
        annotation = Variable(annotation)
        init_input = Variable(init_input)
        target = Variable(label)
        mask = Variable(mask)

        output = net(init_input)
        output = output.argmax(axis=2)[:, :, np.newaxis]

        # save the predictions and labels
        os.makedirs(OutputDir + "/output", exist_ok=True)
        for batch in range(opt.batchSize):
            p = output.detach().numpy()[batch]
            t = target[batch].numpy()
            m = mask[batch].numpy()
            mmwrite(OutputDir + "/output/pred" + str(sample_idx.numpy()[batch]), lil_matrix(p))
            mmwrite(OutputDir + "/output/true" + str(sample_idx.numpy()[batch]), lil_matrix(t))
            mmwrite(OutputDir + "/output/mask" + str(sample_idx.numpy()[batch]), lil_matrix(m))
Example #37
def cover(socp_data, N):
    """stacks the socp data and partitions it into N
    local dicts describing constraints R <= s"""
    if not settings.paths['mondriaan']:
        raise Exception(
            "Please provide a path to mondriaan: settings.paths['mondriaan'] = PATH.")

    n = socp_data['c'].shape[0]

    # form the Laplacian and use pymetis to partition
    L = form_laplacian(socp_data)
    io.mmwrite("mondriaan.mtx", L)

    import subprocess
    outpath = "mondriaan.mtx-P%d" % N
    proc = subprocess.Popen(
        [settings.paths['mondriaan'], "mondriaan.mtx", str(N), "0.05"])
    proc.wait()

    with open(outpath, "r") as f:
        f.readline()    # ignore comments
        f.readline()    # ignore comments

        # basic info about the matrix
        m, _, _, _ = f.readline().strip().split(" ")
        pstart = []
        # read the starting index of the partition
        for i in xrange(N + 1):
            pstart.append(int(f.readline()))
        part_vert = np.zeros(int(m), dtype=np.int)
        count = 0
        part = 0
        for i in xrange(N):
            while count < pstart[i + 1]:
                (row, col, val) = f.readline().strip().split(" ")
                part_vert[int(row) - 1] = part
                count += 1
            part += 1

    return part_vert[n:]
Example #38
def inference(dataloader, opt, OutputDir, Attribute_idx):
    for i, (sample_idx, annotation, adj_matrix, label,
            mask) in enumerate(dataloader, 0):
        target = Variable(label)
        mask = Variable(mask)
        output = annotation[:, :, -1][:, :, Attribute_idx]
        output = output.argmax(axis=2)[:, :, np.newaxis]

        for batch in range(opt.batchSize):
            ts = int(sample_idx[batch].numpy())
            output[batch][pred_binary[ts] > threshold] = torch.LongTensor(
                pred_transfer[ts][pred_binary[ts] > threshold])

        # save the predictions and labels
        os.makedirs(OutputDir + "/output", exist_ok=True)
        for batch in range(opt.batchSize):
            p = output.detach().numpy()[batch]
            t = target[batch].numpy()
            m = mask[batch].numpy()
            mmwrite(
                OutputDir + "/output/pred" + str(sample_idx.numpy()[batch]),
                lil_matrix(p))
            mmwrite(
                OutputDir + "/output/true" + str(sample_idx.numpy()[batch]),
                lil_matrix(t))
            mmwrite(
                OutputDir + "/output/mask" + str(sample_idx.numpy()[batch]),
                lil_matrix(m))
Example #39
 def __init__(self, programEntities):
     nusers = len(programEntities.userIndex.keys())
     self.numFriends = np.zeros((nusers))
     self.userFriends = ss.dok_matrix((nusers, nusers))
     fin = open("user_friends.csv", 'rb')
     fin.readline()  # skip header
     ln = 0
     for line in fin:
         if ln % 200 == 0:
             print("Loading line: ", ln)
         cols = line.strip().split(",")
         user = cols[0]
         if programEntities.userIndex.has_key(user):
             friends = cols[1].split(" ")
             i = programEntities.userIndex[user]
             self.numFriends[i] = len(friends)
             for friend in friends:
                 if programEntities.userIndex.has_key(friend):
                     j = programEntities.userIndex[friend]
                     # the objective of this score is to infer the degree to
                     # and direction in which this friend will influence the
                     # user's decision, so we sum the user/event score for
                     # this user across all training events.
                     eventsForUser = programEntities.userEventScores.getrow(
                         j).todense()
                     score = eventsForUser.sum() / np.shape(
                         eventsForUser)[1]
                     self.userFriends[i, j] += score
                     self.userFriends[j, i] += score
         ln += 1
     fin.close()
      # normalize the arrays
     sumNumFriends = self.numFriends.sum(axis=0)
     self.numFriends = self.numFriends / sumNumFriends
     sio.mmwrite("UF_numFriends", np.matrix(self.numFriends))
     self.userFriends = normalize(self.userFriends,
                                  norm="l1",
                                  axis=0,
                                  copy=False)
     sio.mmwrite("UF_userFriends", self.userFriends)
Example #40
def vector_word():

    with open(
            'I:\MeachineLearnProject\SpamMessage-LR-Twt/RawData/train_content.json',
            'r') as f:
        content = json.load(f)
    with open(
            'I:\MeachineLearnProject\SpamMessage-LR-Twt/RawData/train_label.json',
            'r') as f:
        label = json.load(f)
    '''
        vec_count = MessageCountVectorizer(min_df=2, max_df=0.8)
        data_count = vec_count.fit_transform(content)
        name_count_feature = vec_count.get_feature_names()
    '''
    content_sub = content[0:100000]
    label_sub = label[0:100000]

    # build the tf-idf matrix for the subsample
    vec_tfidf = TfidfVectorizer(min_df=2, max_df=0.8)
    data_tfidf = vec_tfidf.fit_transform(content_sub)
    name_tfidf_feature = vec_tfidf.get_feature_names()
    #DecisionTreeClassifyTfidf(data_tfidf, name_tfidf_feature)

    io.mmwrite(
        'I:\MeachineLearnProject\SpamMessage-LR-Twt/Data/word_vector_sub.mtx',
        data_tfidf)

    with open(
            'I:\MeachineLearnProject\SpamMessage-LR-Twt/Data/train_label_sub.json',
            'w') as f:
        json.dump(label_sub, f)
    with open(
            'I:\MeachineLearnProject\SpamMessage-LR-Twt/Data/vector_type_sub.json',
            'w') as f:
        json.dump(name_tfidf_feature, f)

    return content_sub, label_sub
Example #41
def get_debug(data):
    full_train = sio.mmread('data/%s_train.mtx' % data).tocsr()
    (nu, nm) = full_train.shape

    print 'sampling'
    debug_mids = sample(range(nm), nm / 5)
    debug_uids = sample(range(nu), nu / 5)

    debug = full_train[debug_uids][:, debug_mids].tocoo()
    nr = debug.nnz
    train_ids, _, test_ids = sample_split(nr)

    # build matrix from given indices
    print 'writing debug_train'
    debug_train = coo_matrix(
        (debug.data[train_ids], (debug.row[train_ids], debug.col[train_ids])),
        debug.shape)
    sio.mmwrite('data/%s_debug_train.mtx' % data, debug_train)
    print 'writing debug_test'
    debug_test = coo_matrix(
        (debug.data[test_ids], (debug.row[test_ids], debug.col[test_ids])),
        debug.shape)
    sio.mmwrite('data/%s_debug_test.mtx' % data, debug_test)

    # build movie mtx from debug_mids
    print 'movie debug'
    movies = sio.mmread('data/movies.mtx').tocsr()
    movies_debug = movies[debug_mids]
    sio.mmwrite('data/movies_%s_debug.mtx' % data, movies_debug)

    return debug, debug_train, debug_test, movies_debug
Example #42
  def __init__(self, programEntities, psim=ssd.correlation, csim=ssd.cosine):
    cleaner = DataCleaner()
    fin = open("../Data/events.csv", 'rb')
    fin.readline() # skip header
    nevents = len(programEntities.eventIndex.keys())
    self.eventPropMatrix = ss.dok_matrix((nevents, 7))
    self.eventContMatrix = ss.dok_matrix((nevents, 100))
    ln = 0
    for line in fin.readlines():
#      if ln > 10:
#        break
      cols = line.strip().split(",")
      eventId = cols[0]
      if programEntities.eventIndex.has_key(eventId):
        i = programEntities.eventIndex[eventId]
        self.eventPropMatrix[i, 0] = cleaner.getJoinedYearMonth(cols[2]) # start_time
        self.eventPropMatrix[i, 1] = cleaner.getFeatureHash(cols[3]) # city
        self.eventPropMatrix[i, 2] = cleaner.getFeatureHash(cols[4]) # state
        self.eventPropMatrix[i, 3] = cleaner.getFeatureHash(cols[5]) # zip
        self.eventPropMatrix[i, 4] = cleaner.getFeatureHash(cols[6]) # country
        self.eventPropMatrix[i, 5] = cleaner.getFloatValue(cols[7]) # lat
        self.eventPropMatrix[i, 6] = cleaner.getFloatValue(cols[8]) # lon
        for j in range(9, 109):
          self.eventContMatrix[i, j-9] = cols[j]
        ln += 1
    fin.close()
    self.eventPropMatrix = normalize(self.eventPropMatrix,
        norm="l1", axis=0, copy=False)
    sio.mmwrite("../Models/EV_eventPropMatrix", self.eventPropMatrix)
    self.eventContMatrix = normalize(self.eventContMatrix,
        norm="l1", axis=0, copy=False)
    sio.mmwrite("../Models/EV_eventContMatrix", self.eventContMatrix)
    # calculate similarity between event pairs based on the two matrices    
    self.eventPropSim = ss.dok_matrix((nevents, nevents))
    self.eventContSim = ss.dok_matrix((nevents, nevents))
    for e1, e2 in programEntities.uniqueEventPairs:
      i = programEntities.eventIndex[e1]
      j = programEntities.eventIndex[e2]
      if not self.eventPropSim.has_key((i,j)):
        epsim = psim(self.eventPropMatrix.getrow(i).todense(),
          self.eventPropMatrix.getrow(j).todense())
        self.eventPropSim[i, j] = epsim
        self.eventPropSim[j, i] = epsim
      if not self.eventContSim.has_key((i,j)):
        ecsim = csim(self.eventContMatrix.getrow(i).todense(),
          self.eventContMatrix.getrow(j).todense())
        self.eventContSim[i, j] = ecsim
        self.eventContSim[j, i] = epsim
    sio.mmwrite("../Models/EV_eventPropSim", self.eventPropSim)
    sio.mmwrite("../Models/EV_eventContSim", self.eventContSim)
Example #43
def load_stats(stats_f, vocab_f):
    """ Validate and load the input stats """

    stats = sio.mmread(stats_f)

    if vocab_f:
        vocab = read_simple_flist(vocab_f)

        # Check the compatibility of stats
        if stats.shape[1] == len(vocab):
            stats = stats.T
            print("Transposed the stats to make them word-by-doc.")
            sio.mmwrite(os.path.realpath(stats_f), stats)

        if stats.shape[0] != len(vocab):
            print(
                "Number of rows in stats should match with length of vocabulary."
            )
            print("Given stats:", stats.shape[0], "vocab. length:", len(vocab))
            sys.exit()

    return stats.tocsc()
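
A minimal round-trip sketch of the transpose check in load_stats, using toy file and vocabulary names:

import numpy as np
from scipy import io as sio
from scipy.sparse import csr_matrix

sio.mmwrite("stats.mtx", csr_matrix(np.ones((3, 5))))   # doc-by-word on disk
stats = sio.mmread("stats.mtx")
vocab = ["w%d" % i for i in range(5)]
if stats.shape[1] == len(vocab):
    stats = stats.T        # make it word-by-doc, as load_stats does
assert stats.shape[0] == len(vocab)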
Example #44
 def __init__(self, programEntities=None, isClean=True):
     print("Event popularity initialization started...")
     if not isClean:
         raise ImportError("Skipping the popularity statistics makes the results inaccurate! Set it to True or keep the default.")
     self.programEntities = programEntities
     self.num_events = len(self.programEntities.eventIndex.keys())
     self.eventPopularity = ss.dok_matrix((self.num_events, 1))
     with open('event_attendees.csv', 'r') as reader:
         reader.readline()  # skip header
         for line in reader:
             cols = line.strip().split(",")
             eventId = cols[0]
             if eventId in self.programEntities.eventIndex:
                 i = self.programEntities.eventIndex[eventId]
                 self.eventPopularity[i, 0] = len(cols[1].split(" ")) - len(
                     cols[4].split(" "))
     self.eventPopularity = normalize(self.eventPopularity,
                                      norm="l1",
                                      axis=0,
                                      copy=False)
     sio.mmwrite("Event_Popularity", self.eventPopularity)
     print("Event popularity statistics finished...\n\n{}\n".format("*" * 200))
Example #45
def build_text_vector(fin, stopwords_pattern):
    """
  Create temporary fields by concatenating text columns to form
  a new column and generate a vector of term frequencies.
  """
    print "Building text vector..."
    fout = str.replace(fin, ".csv", ".text.mtx")
    if os.path.isfile(fout):
        return
    ftmp = str.replace(fin, ".csv", ".tmp")
    reader = csv.reader(open(fin, 'rb'))
    tmpwriter = open(ftmp, 'wb')
    ln = 0
    for row in reader:
        ln += 1
        if ln <= 1:
            continue  # skip header
        if ln % 1000 == 0:
            print "...(processed %d lines)" % (ln)
        title = row[1]
        full_description = extract_keywords(row[2], stopwords_pattern)
        loc_raw = row[3]
        tmpwriter.write(" ".join(
            [title, title, title, title, full_description, loc_raw, loc_raw]) +
                        "\n")
    tmpwriter.close()
    vectorizer = sft.CountVectorizer(max_features=1000)
    #  vectorizer = sft.TfidfVectorizer(
    #    charset_error="ignore",
    #    strip_accents="ascii",
    #    stop_words="english",
    #    max_features=100,
    #    use_idf=False)
    tmpreader = open(ftmp, 'rb')
    tdmatrix = vectorizer.fit_transform(tmpreader)
    os.remove(ftmp)
    writer = open(fout, 'wb')
    sio.mmwrite(writer, tdmatrix)
    writer.close()
Example #46
def save_randomforest_path(model_path, w2v_path):
    if os.path.exists(model_path):
        print("the model already exists.")
        clf = joblib.load(model_path)
    else:
        print("the model doesn't exists.")
        return None

    w2v_list = list()
    for root, dirs, files in os.walk(w2v_path):
        for file in files:
            if os.path.splitext(file)[1] == '.txt':
                w2v_list.append(file)
    w2v_list.sort()
    for w2v_name in w2v_list:
        filename = BasePath + "/w2v_corpus/" + w2v_name
        w2v_vec = np.loadtxt(filename)
        print(w2v_vec.shape)
        path_of_sample, _ = clf.decision_path(w2v_vec)
        save_file_path = BasePath + "/rf_path/" + "path_" + w2v_name.split(
            '.')[0] + ".mtx"
        io.mmwrite(save_file_path, path_of_sample)
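`decision_path` returns a sparse node-indicator matrix (plus a pointer array), which is exactly what `mmwrite` stores here; a toy sketch with made-up data and a hypothetical file name:

import numpy as np
from scipy import io
from sklearn.ensemble import RandomForestClassifier

X = np.random.rand(20, 5)
y = np.random.randint(0, 2, size=20)
clf = RandomForestClassifier(n_estimators=10).fit(X, y)
indicator, _ = clf.decision_path(X)  # CSR matrix, samples x total tree nodes
io.mmwrite("path_demo.mtx", indicator)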
Esempio n. 47
0
 def __init__(self, programEntities, sim=ssd.correlation):
     cleaner = DataCleaner()
     nusers = len(programEntities.userIndex.keys())
     fin = open("../data/users.csv", 'rb')
     colnames = fin.readline().strip().split(",")
     self.userMatrix = ss.dok_matrix((nusers, len(colnames) - 1))
     for line in fin:
         cols = line.strip().split(",")
         # only consider users that appear in train.csv
         if cols[0] in programEntities.userIndex:
             i = programEntities.userIndex[cols[0]]
             # preprocess each field before storing it in userMatrix
             self.userMatrix[i, 0] = cleaner.getLocaleId(cols[1])
             self.userMatrix[i, 1] = cleaner.getBirthYearInt(cols[2])
             self.userMatrix[i, 2] = cleaner.getGenderId(cols[3])
             self.userMatrix[i, 3] = cleaner.getJoinedYearMonth(cols[4])
             self.userMatrix[i, 4] = cleaner.getCountryId(cols[5])
             self.userMatrix[i, 5] = cleaner.getTimezoneInt(cols[6])
     fin.close()
     # normalize the user matrix
     self.userMatrix = normalize(self.userMatrix,
                                 norm="l1",
                                 axis=0,
                                 copy=False)
     sio.mmwrite("US_userMatrix", self.userMatrix)
     # compute the user similarity matrix; it is used later
     self.userSimMatrix = ss.dok_matrix((nusers, nusers))
     for i in range(0, nusers):
         self.userSimMatrix[i, i] = 1.0
     for u1, u2 in programEntities.uniqueUserPairs:
         i = programEntities.userIndex[u1]
         j = programEntities.userIndex[u2]
         if (i, j) not in self.userSimMatrix:
             usim = sim(
                 self.userMatrix.getrow(i).todense(),
                 self.userMatrix.getrow(j).todense())
             self.userSimMatrix[i, j] = usim
             self.userSimMatrix[j, i] = usim
     sio.mmwrite("US_userSimMatrix", self.userSimMatrix)
def Users():
    pr = ProgramEntities()
    nusers = len(pr.userIndex)
    sim = ssd.correlation

    cleaner = DataCleaner()
    fin = open(r"D:\kaggle_data\event_recommendation\users.csv", 'r')
    colnames = fin.readline().strip().split(",")
    userMatrix = sparse.dok_matrix((nusers, len(colnames)-1))   # sparse matrix
    for line in fin:
        cols = line.strip().split(",")
        # only consider users that appear in train.csv
        if cols[0] in pr.userIndex:
            i = pr.userIndex[cols[0]]
            userMatrix[i, 0] = cleaner.getLocaleId(cols[1])
            userMatrix[i, 1] = cleaner.getBirthYearInt(cols[2])
            userMatrix[i, 2] = cleaner.getGenderId(cols[3])
            userMatrix[i, 3] = cleaner.getJoinedYearMonth(cols[4])
            userMatrix[i, 4] = cleaner.getCountryId(cols[5])
            userMatrix[i, 5] = cleaner.getTimezoneInt(cols[6])
    fin.close()
    # print(userMatrix)

    # normalize the user matrix and save it
    userMatrix_N = normalize(userMatrix, norm='l1', axis=0)     # axis=0: normalize along columns
    # sio.mmwrite("US_userMatrix", userMatrix_N)
    # build a user similarity matrix
    userSimMatrix = sparse.dok_matrix((nusers, nusers))
    for i in range(nusers):
        userSimMatrix[i, i] = 1.0
    for u1,u2 in pr.uniqueUserPairs:
        i = pr.userIndex[u1]
        j = pr.userIndex[u2]
        if (i, j) not in userSimMatrix:
            usim = sim(userMatrix_N.getrow(i).todense(), userMatrix_N.getrow(j).todense())
            userSimMatrix[i, j] = usim
            userSimMatrix[j, i] = usim
    sio.mmwrite("US_userSimMatrix", userSimMatrix)
Esempio n. 49
0
def main():
    """
        Main entry point to script to perform spectral co-clustering.

        Returns:

        - `0` or `1` on success or failure respectively.
        - Saves `centroids`, `centroiddict`, `clusters` and `clusterdict` in \
                working dir.

    """
    parser = gen_args()
    args = parser.parse_args()
    sessionid = args.sessionid
    A = spio.mmread(args.A).tocsc()
    logger = logging.getLogger(__name__)
    logger.addHandler(logging.StreamHandler())
    if args.verbose:
        logger.setLevel(logging.DEBUG)
    if not args.k:
        parser.error("k is required")  # avoid a NameError when -k is omitted
    k = args.k
    spcc = SpectralCoClusterer(A, k, args.n, args.delta, \
                               args.randomcentroids, \
                               args.classical, args.verbose)
    result = spcc.run()
    clusters = result['clusters']
    centroids = result['centroids']
    centroid_dict = result['centroiddict']
    cluster_dict = result['clusterdict']
    cPickle.dump(clusters, open("clusters_" + sessionid + '.pck', 'w'))
    cPickle.dump(centroid_dict, open("centroid_dict_" + \
                                    sessionid + '.pck', 'w'))
    cPickle.dump(cluster_dict, open("cluster_dict_" + \
                                    sessionid + '.pck', 'w'))
    spio.mmwrite(open("centroids_" + sessionid + '.mtx', 'w'), \
                 centroids, comment="CSC Matrix", field='real')
    logger.info(" %d Clusters Generated ", len(clusters))
    return 0
def genHashes():
    with open('pickled_minhash/feature_matrix_binary_sample.npy', 'rb') as f:
        #size of feature_matrix_large: 1261 x 19043
        feature_matrix = np.load(f)
    with open('pickled_minhash/actual_jaccard_matrix_small.mtx', 'rb') as f:
        baseline = io.mmread(f)
    baseline = baseline.todok()

    jaccard_matrix = []

    # jaccard_matrix is a list of arrays holding the non-zero indices of
    # each article in the corpus
    for i in feature_matrix[0:test_num]:
        indices = np.flatnonzero(i)
        jaccard_matrix.append(indices)
    k_vals = [16, 32, 64, 128, 256]
    for k in k_vals:

        # calculate MinHash signatures for the current k
        S = sparse.dok_matrix((len(jaccard_matrix), len(jaccard_matrix)))
        t0 = time.time()
        hashmap = {}
        for i in range(0, len(jaccard_matrix)):
            mh = MinHash(num_perm=k)
            for d in jaccard_matrix[i]:
                mh.digest(sha1(struct.pack("!I", d)))
            hashmap[i] = mh
        for i in range(0, len(jaccard_matrix)):
            m8_1 = hashmap[i]
            for j in range(0, i + 1):
                m8_2 = hashmap[j]
                estj = MinHash.jaccard(m8_1, m8_2)
                if estj != 0 and estj != 1:
                    S[i, j] = estj
        print("Time to calculate first %s estj (k=%s): %f" %
              (len(jaccard_matrix), k, time.time() - t0))
        with open('pickled_minhash/estj_' + str(k) + '_small.mtx', 'wb') as f:
            # save the estimated Jaccard similarities for this k
            io.mmwrite(f, S)
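A sketch of the same estimate with the current datasketch API (an assumption: older releases exposed the `mh.digest(sha1(...))` call used above, while newer ones use `update`/`jaccard`; `estimate_jaccard` is a hypothetical helper):

from datasketch import MinHash

def estimate_jaccard(set_a, set_b, num_perm=128):  # hypothetical helper
    ma, mb = MinHash(num_perm=num_perm), MinHash(num_perm=num_perm)
    for x in set_a:
        ma.update(str(x).encode("utf8"))
    for x in set_b:
        mb.update(str(x).encode("utf8"))
    return ma.jaccard(mb)

print(estimate_jaccard({1, 2, 3, 4}, {2, 3, 4, 5}))  # true Jaccard is 0.6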
Esempio n. 51
0
def save_sparse_matrix(data, fmt, filepath):
    """
    Save a scipy sparse matrix in the specified format. Row and column
    indices will be converted to 1-indexed if you specify a plain text
    format (tsv, csv, mm). Note that only explicitly stored entries are
    written in tsv or csv format; implicit zeros are not saved.

    Parameters
    ----------
    data : scipy sparse matrix to save
    fmt : str
        Specifies the file format to write:
        - tsv
        - csv
        - mm  (MatrixMarket)
        - npz (save as npz archive of numpy arrays)
        - fsm (mrec.sparse.fast_sparse_matrix)
    filepath : str
        The file to write to.
    """
    if fmt == 'tsv':
        m = data.tocoo()
        with open(filepath, 'w') as out:
            for u, i, v in izip(m.row, m.col, m.data):
                print >> out, '{0}\t{1}\t{2}'.format(u + 1, i + 1, v)
    elif fmt == 'csv':
        m = data.tocoo()
        with open(filepath, 'w') as out:
            for u, i, v in izip(m.row, m.col, m.data):
                print >> out, '{0},{1},{2}'.format(u + 1, i + 1, v)
    elif fmt == 'mm':
        mmwrite(filepath, data)
    elif fmt == 'npz':
        savez(data.tocoo(), filepath)
    elif fmt == 'fsm':
        fast_sparse_matrix(data).save(filepath)
    else:
        raise ValueError('unknown output format: {0}'.format(fmt))
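Usage sketch, assuming the function above is importable (note that the plain-text branches rely on Python 2 constructs such as `izip` and `print >>`; the file names are illustrative):

import scipy.sparse as sp

m = sp.coo_matrix(([1.0, 2.0], ([0, 1], [1, 0])), shape=(2, 2))
save_sparse_matrix(m, 'mm', 'ratings.mtx')   # MatrixMarket via mmwrite
save_sparse_matrix(m, 'tsv', 'ratings.tsv')  # 1-indexed "row\tcol\tvalue" lines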
Esempio n. 52
0
def index(search_dir, index_dir):
    cmd = 'pdftotext "%s" %s/loog.txt'
    dirs, files = rsync.ls(search_dir)
    files = [(f, size) for (f, size) in files if '.pdf' in f]
    N = len(files)
    A = sps.lil_matrix((N, cols))
    print A.shape
    df_files = []
    for i, (f, size) in enumerate(files):
        file = f.replace("\\", "/")
        print file
        if ".pdf" in file:
            cmd2 = cmd % (f, os.environ['TEMP'])
            os.system(cmd2)
            lowers = open(
                "%s/loog.txt" %
                os.environ['TEMP']).read().decode("ISO-8859-1").lower()
            tokens = nltk.word_tokenize(lowers)
            tokens = stem_tokens(tokens)
            print tokens[:30]
            for token in tokens:
                A[i, hash(token) % cols] += 1
            df_files.append([file, size])

    df = A.copy()
    df[df > 0] = 1.
    df = np.array(df.sum(axis=0))
    idf = df.copy()
    idf[df.nonzero()] = np.log(N / df[df.nonzero()])
    io.mmwrite(index_dir + "/loogle_idf.mtx", idf)

    tf = A.copy().tocoo()
    tf.data = 1 + np.log(tf.data)
    tfidf = sps.csr_matrix(tf.multiply(idf))
    tfidf = normalize(tfidf, norm='l2', axis=1)
    io.mmwrite(index_dir + "/loogle_tfidf.mtx", tfidf)
    df_files = pd.DataFrame(df_files, columns=['file', 'size'])
    df_files.to_csv(index_dir + "/loogle_files.csv", index=None)
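The weighting used in `index` in miniature: term frequency is damped as `1 + log(count)` and the inverse document frequency is `log(N / df)`; a worked check on toy numbers:

import numpy as np

N = 4            # documents in the collection
df = 2           # documents containing the term
count = 3        # occurrences of the term in this document
tf = 1 + np.log(count)
idf = np.log(N / df)
print(tf * idf)  # ~1.45, the unnormalized tf-idf weight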
def main():
    usage = ""  # TODO
    parser = OptionParser(usage=usage)
    #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat")
    parser.add_option("-o", "--out_dir", help="Output directory")
    (options, args) = parser.parse_args()

    dataset = args[0]
    out_dir = options.out_dir

    r = load_dataset.load_dataset(dataset, 'counts')
    the_exps = r[3]
    data_matrix = r[10]
    gene_ids = r[11]

    genes_df = pd.read_csv('genes.tsv', sep='\t', index_col=0, header=None)
    genes_df = genes_df.loc[gene_ids]

    with open(join(out_dir, 'matrix.mtx'), 'wb') as f:
        mmwrite(f, coo_matrix(data_matrix.T))
    with open(join(out_dir, 'barcodes.tsv'), 'w') as f:
        f.write('\n'.join(the_exps))
    genes_df.to_csv(join(out_dir, 'genes.tsv'), sep='\t', header=False)
Esempio n. 54
0
def calculate(words, feature, fname):
    e = np.zeros((len(words), len(words)))
    for row in range(0, len(words)):
        for column in range(row, len(words)):
            if row < column:
                tem = (np.dot(feature[row], feature[column]))\
                      / (np.linalg.norm(feature[row])*np.linalg.norm(feature[column]))
                e[row][column] = np.arccos(-tem)
                e[column][row] = e[row][column]
    for i in range(0, len(words)):
        kth_max = np.sort(e[i])
        zero_list = [0]
        e_row = e[i]
        e_row = np.where(e_row < kth_max[len(words) - 300], zero_list, e_row)
        e[i] = e_row
    for row in range(0, len(words)):
        for column in range(row, len(words)):
            if e[row][column] != e[column][row]:
                e[row][column] = 0
                e[column][row] = 0
    sparse_e = sparse.csr_matrix(e)
    io.mmwrite(fname, sparse_e)
    return
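The pairwise weight above is `arccos(-cosine_similarity)`, i.e. pi minus the usual angle between the feature vectors; a two-vector check:

import numpy as np

a = np.array([1.0, 0.0])
b = np.array([1.0, 1.0])
cos_ab = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
print(np.arccos(-cos_ab))  # ~2.36 rad for a 45-degree pair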
Esempio n. 55
0
def save_data(sco, outputpath):
    mmf = os.path.join(outputpath, 'expr_m.mtx')
    # with open(outputpath + 'expr_m.mtx', 'w') as f:
    #     mmwrite(f, sco.expression_matrix)
    mmwrite(mmf, csc_matrix(sco.expression_matrix))

    sc_info = []
    sc_info.append(['meta_info'])
    meta_info = sco.meta_info
    p = sco.processed
    p.append('save data')
    meta_info['processed'] = p
    meta_info = list([list(meta_info.keys()), list(meta_info.values())])
    sc_info.append(meta_info)

    sc_info[0].append("genes_list")
    sc_info.append(list(sco.gene_ref.get_list()))
    for i in sco.cell_info.data_names:
        sc_info[0].append(i)
        sc_info.append(list(sco.cell_info[i]))  

    with open(os.path.join(outputpath, 'info.json'), 'w') as f:
        json.dump(sc_info, f)
Esempio n. 56
0
    def test_sparse_formats(self):
        mats = []

        I = array([0, 0, 1, 2, 3, 3, 3, 4])
        J = array([0, 3, 1, 2, 1, 3, 4, 4])

        V = array([1.0, 6.0, 10.5, 0.015, 250.5, -280.0, 33.32, 12.0])
        mats.append(scipy.sparse.coo_matrix((V, (I, J)), shape=(5, 5)))

        V = array([
            1.0 + 3j, 6.0 + 2j, 10.50 + 0.9j, 0.015 + -4.4j, 250.5 + 0j,
            -280.0 + 5j, 33.32 + 6.4j, 12.00 + 0.8j
        ])
        mats.append(scipy.sparse.coo_matrix((V, (I, J)), shape=(5, 5)))

        for mat in mats:
            expected = mat.toarray()
            for fmt in ['csr', 'csc', 'coo']:
                fn = mktemp(dir=self.tmpdir)  # safe, we own tmpdir
                mmwrite(fn, mat.asformat(fmt))

                result = mmread(fn).toarray()
                assert_array_almost_equal(result, expected)
Esempio n. 57
0
    def test_gzip_py3(self):
        # test if fix for #2152 works
        try:
            # gzip module can be missing from Python installation
            import gzip
        except ImportError:
            return
        I = array([0, 0, 1, 2, 3, 3, 3, 4])
        J = array([0, 3, 1, 2, 1, 3, 4, 4])
        V = array([1.0, 6.0, 10.5, 0.015, 250.5, -280.0, 33.32, 12.0])

        b = scipy.sparse.coo_matrix((V, (I, J)), shape=(5, 5))

        mmwrite(self.fn, b)

        fn_gzip = "%s.gz" % self.fn
        with open(self.fn, 'rb') as f_in:
            f_out = gzip.open(fn_gzip, 'wb')
            f_out.write(f_in.read())
            f_out.close()

        a = mmread(fn_gzip).toarray()
        assert_array_almost_equal(a, b.toarray())
Esempio n. 58
0
    def test_bzip2_py3(self):
        # test if fix for #2152 works
        try:
            # bz2 module isn't always built when building Python.
            import bz2
        except ImportError:
            return
        I = array([0, 0, 1, 2, 3, 3, 3, 4])
        J = array([0, 3, 1, 2, 1, 3, 4, 4])
        V = array([1.0, 6.0, 10.5, 0.015, 250.5, -280.0, 33.32, 12.0])

        b = scipy.sparse.coo_matrix((V, (I, J)), shape=(5, 5))

        mmwrite(self.fn, b)

        fn_bzip2 = "%s.bz2" % self.fn
        with open(self.fn, 'rb') as f_in:
            f_out = bz2.BZ2File(fn_bzip2, 'wb')
            f_out.write(f_in.read())
            f_out.close()

        a = mmread(fn_bzip2).toarray()
        assert_array_almost_equal(a, b.toarray())
Esempio n. 59
0
    def export_matrix(self, filename, matrix_name=None, output_format='matlab', mu=None):
        """Save the matrix of the operator to a file.

        Parameters
        ----------
        filename
            Name of output file.
        matrix_name
            The name given to the output matrix. (Used as the comment field
            in case of Matrix Market output_format.) If `None`, the
            |Operator|'s `name` is used.
        output_format
            Output file format. Either `matlab` or `matrixmarket`.
        mu
            The |Parameter| for which to assemble the matrix to be exported.
        """
        assert output_format in {'matlab', 'matrixmarket'}
        matrix = self.assemble(mu).matrix
        matrix_name = matrix_name or self.name
        if output_format == 'matlab':
            savemat(filename, {matrix_name: matrix})
        else:
            mmwrite(filename, matrix, comment=matrix_name)
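Both branches boil down to one scipy call each; a sketch with a plain sparse matrix (no pyMOR |Operator| needed; the file and matrix names are illustrative):

from scipy.io import savemat, mmwrite
from scipy.sparse import eye

matrix = eye(3, format='csr')
savemat('op.mat', {'my_op': matrix})        # the 'matlab' branch
mmwrite('op.mtx', matrix, comment='my_op')  # the 'matrixmarket' branch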
Esempio n. 60
0
    def write_matrix_10X(self, df, matrix_dir):
        if not os.path.exists(matrix_dir):
            os.mkdir(matrix_dir)

        df_UMI = df.groupby(['geneID', 'Barcode']).agg({'UMI': 'count'})
        mtx = coo_matrix(
            (df_UMI.UMI, (df_UMI.index.codes[0], df_UMI.index.codes[1])))
        gene_id = df_UMI.index.levels[0].to_series()
        # add gene symbol
        gene_name = gene_id.apply(lambda x: self.gtf_dict[x])
        genes = pd.concat([gene_id, gene_name], axis=1)
        genes.columns = ['gene_id', 'gene_name']

        barcodes = df_UMI.index.levels[1].to_series()
        genes.to_csv(f'{matrix_dir}/{FEATURE_FILE_NAME}',
                     index=False,
                     sep='\t',
                     header=False)
        barcodes.to_csv(f'{matrix_dir}/{BARCODE_FILE_NAME}',
                        index=False,
                        sep='\t',
                        header=False)
        mmwrite(f'{matrix_dir}/{MATRIX_FILE_NAME}', mtx)
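The groupby/MultiIndex-codes trick above, shown on toy data: the integer codes of each index level serve directly as COO row and column indices (pandas >= 0.24 assumed for `.codes`):

import pandas as pd
from scipy.sparse import coo_matrix

df = pd.DataFrame({'geneID': ['g1', 'g1', 'g2'],
                   'Barcode': ['b1', 'b2', 'b1'],
                   'UMI': ['u1', 'u2', 'u3']})
df_UMI = df.groupby(['geneID', 'Barcode']).agg({'UMI': 'count'})
mtx = coo_matrix((df_UMI.UMI,
                  (df_UMI.index.codes[0], df_UMI.index.codes[1])))
print(mtx.toarray())  # genes x barcodes count matrix: [[1 1], [1 0]]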