Example no. 1
def get_sparse_matrix(train=None, test=None, params=None, remove_numbers_function=True, debug=True, save=False, load=True, data_dir="data"):
    """
    Get the train and test sets in sparse matrix form

    Parameters
    -------------------------
    Each input is a numpy array:
    train, test, params: see the documentation of the tf_idf function
    save: whether to save the train and test sparse matrices in .npz format
    load: whether to load the train and test sparse matrices from your local machine
    data_dir: the data directory where the matrices are saved

    Returns:
    --------------------------
    train: train set in sparse matrix form
    test: test set in sparse matrix form

    Example
    -------
        >>> train = pd.read_csv("data/train.csv")
        >>> test = pd.read_csv("data/test.csv")
        >>> # to create and save the train and test set
        >>> train_sparse, test_sparse = get_sparse_matrix(train, test, params=None, remove_numbers_function=True, debug=True, save=True, load=False)
        >>> # to load the sparse matrices from your local machine
        >>> train, test = get_sparse_matrix(load=True)

    """
    base_dir = data_dir + '/output/'

    if not os.path.exists(base_dir):
        os.makedirs(base_dir)

    name_train = base_dir + 'sparce_train.npz'
    name_test = base_dir + 'sparce_test.npz'

    if load:
        if os.path.exists(name_train) and os.path.exists(name_test):
            train, test = load_sparse_csr(name_train), load_sparse_csr(name_test)
        else:
            raise ValueError("You asked to load the features but they were not found "
                             + "at the specified location:\n{}\n{}".format(name_train, name_test))

    else:
        print('Computing the sparse matrices, this will take a while...')
        train, test = tf_idf(train, test, params, remove_numbers_function, debug)

    if save:
        print('Saving train file as {}'.format(name_train))
        save_sparse_csr(name_train, train)
        print('Saving test file as {}'.format(name_test))
        save_sparse_csr(name_test, test)

    return train, test
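All of the snippets on this page rely on save_sparse_csr/load_sparse_csr helpers defined elsewhere in their respective repositories. As a point of reference, a minimal sketch of the single-matrix variant used in Examples 1 and 9 might look like this (an assumption about those helpers, not their actual code):

import numpy as np
from scipy.sparse import csr_matrix

def save_sparse_csr(filename, matrix):
    # Store the three CSR component arrays plus the shape in one .npz archive.
    np.savez(filename, data=matrix.data, indices=matrix.indices,
             indptr=matrix.indptr, shape=matrix.shape)

def load_sparse_csr(filename):
    # Rebuild the CSR matrix from the arrays written by save_sparse_csr.
    loader = np.load(filename)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=tuple(loader['shape']))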
Example no. 2
def calc_weighted_modularity_langs(A_filename, indice2word_filename, langs):
    """

    :param A_filename: path to the adjacency matrix saved in sparse CSR (.npz) format
    :param indice2word_filename: path to the pickled index-to-word mapping
    :param langs: prefix tags for each word in the .txt embedding file
    :return: normalized modularity
    """
    A = load_sparse_csr(A_filename).todok()
    indice2word = pickle.load(open(indice2word_filename, "rb"))
    num_community = len(langs)
    a_l = [0] * num_community
    e_ll = [0] * num_community
    e_ll_Q_max = [0] * num_community
    lang_to_indice = {lang: i for i, lang in enumerate(langs)}  # assigns a community ID to each language

    degree_list, two_m = get_degrees(A)
    """
    a_l computation
    """
    for node_id in range(A.shape[0]):
        node_word = indice2word[node_id]
        community = lang_to_indice[node_word[:3]]
        k_i = degree_list[node_id]
        a_l[community] += k_i
    """
    e_ll computation
    """
    for (node1, node2), weight in A.items():
        node1_word = indice2word[node1]
        node2_word = indice2word[node2]
        community_id = lang_to_indice[node1_word[:3]]

        e_ll_Q_max[community_id] += 1  # 1 is maximum possible weight
        if node1_word[:3] == node2_word[:3]:
            e_ll[community_id] += weight

    assert sum(e_ll_Q_max) == two_m

    e_ll = list(map(lambda x: x / two_m, e_ll))
    e_ll_Q_max = list(map(lambda x: x / two_m, e_ll_Q_max))
    a_l = list(map(lambda x: x / two_m, a_l))
    print("mean degree=%.4f" % (np.mean(degree_list)))

    Q = 0
    Q_max = 0
    for i in range(len(a_l)):
        Q += e_ll[i] - (a_l[i]**2)
        Q_max += e_ll_Q_max[i] - (a_l[i]**2)
    print("Weighted Q=%.3f" % Q)
    print("Weighted Q_max=%.3f" % Q_max)
    print("Normalized Q=%.3f" % (Q / Q_max))
    return Q / Q_max
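For reference, the value printed and returned above is the weighted modularity normalized by its maximum attainable value, where e_{ll} is the fraction of total edge weight inside community l, e_{ll}^{max} the fraction it would have if every within-community entry had weight 1, and a_l the fraction of total degree attached to community l:

Q = \sum_l \left( e_{ll} - a_l^2 \right), \qquad
Q_{\max} = \sum_l \left( e_{ll}^{\max} - a_l^2 \right), \qquad
Q_{\text{norm}} = \frac{Q}{Q_{\max}}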
Example no. 3
    def __init__(self, db_path=DB_PATH, mat_path=MAT_PATH):
        self.db_path = db_path
        self.mat_path = mat_path
        self.db = DocDB(db_path=self.db_path)
        self.mat, metadata = utils.load_sparse_csr(self.mat_path)

        # doc_freqs, hash_size, ngram, doc_dict
        for k, v in metadata.items():
            setattr(self, k, v)
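In Examples 3, 4, 10 and 11, utils.load_sparse_csr returns a (matrix, metadata) pair rather than a bare matrix. A sketch of that variant, assuming the metadata dict is simply stored in the same .npz archive (an illustration, not the repositories' actual utils module):

import numpy as np
from scipy.sparse import csr_matrix

def save_sparse_csr(filename, matrix, metadata=None):
    # Store the CSR component arrays plus an arbitrary metadata dict in one .npz archive.
    np.savez(filename, data=matrix.data, indices=matrix.indices,
             indptr=matrix.indptr, shape=matrix.shape, metadata=metadata)

def load_sparse_csr(filename):
    # Rebuild the CSR matrix and recover the metadata dict stored alongside it.
    loader = np.load(filename, allow_pickle=True)
    matrix = csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                        shape=tuple(loader['shape']))
    metadata = loader['metadata'].item() if 'metadata' in loader else None
    return matrix, metadata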
Example no. 4
 def __init__(self, strict=True, num_workers=8, tokenize=False, topn=100):
     """
     Args:
         tfidf_path: path to saved model file
         strict: fail on empty queries or continue (and return empty result)
     """
     # Load from disk
     self.topn = topn
     self.tokenize = tokenize
     self.num_workers = num_workers
     tfidf_path = DATA_DIR + "/corpus-tfidf-ngram=1-hash=16777216-tokenizer=spacy.npz"
     logger.info('Loading %s' % tfidf_path)
     matrix, metadata = utils.load_sparse_csr(tfidf_path)
     self.doc_mat = matrix
     self.ngrams = metadata['ngram']
     self.hash_size = metadata['hash_size']
     self.tokenizer = tokenizers.get_class(metadata['tokenizer'])()
     self.doc_freqs = metadata['doc_freqs'].squeeze()
     self.doc_dict = metadata['doc_dict']
     self.num_docs = len(self.doc_dict[0])
     self.strict = strict
Example no. 5
def initCosSimQuery():

    global term2id
    global invertedIndex
    global W
    global docCount
    global termCount
    global idf

    term2tidFile = open("term2tid.json", "r")
    indexFile = open("invertedIndex.json", "r")

    term2id = json.load(term2tidFile)
    invertedIndex = json.load(indexFile)

    term2tidFile.close()
    indexFile.close()

    W = utils.load_sparse_csr("weightMatrix.npz")
    idf = np.load('idf.npy')
    docCount = np.size(W, 1)
    termCount = len(term2id)
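For context, a query routine built on the globals initialised above could weight the query terms by idf and rank documents by their similarity to that vector. A hypothetical sketch (cosSimQuery and its ranking details are illustrative assumptions, not part of the original module):

import numpy as np

def cosSimQuery(query_terms, topn=10):
    # Build an idf-weighted query vector over the vocabulary, then rank the
    # documents (columns of W) by their dot product with it. Assumes the
    # columns of W are length-normalised, so the ordering matches cosine similarity.
    q = np.zeros(termCount)
    for term in query_terms:
        tid = term2id.get(term)
        if tid is not None:
            q[tid] += idf[tid]
    scores = W.T.dot(q)  # shape: (docCount,)
    return np.argsort(-scores)[:topn]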
Example no. 7
 def _load_arrays(self, fn):
     zero_fn = fn + '.npz'
     bin_fn = fn + '.bin.npz'
     self.zero_sparse = load_sparse_csr(zero_fn)
     self.binary_sparse = load_sparse_csr(bin_fn)
Example no. 8
            print(
                'Loading the paragraph-trained classifier trained on data processed by '
                'the biggest gap thresholding mechanism')
            classifier = load_pickle(config.classifier_par_biggest_gap)
            threshold = 0.91
            y_true = process_y(data, threshold_biggest_gap)
        else:
            print(
                'Loading the paragraph-trained classifier trained on data processed by '
                'the threshold_half_max function')
            classifier = load_pickle(config.classifier_par_half_max)
            threshold = 0.39
            y_true = process_y(data, threshold_half_max)

        print("Loading x")
        x = load_sparse_csr(data['x'])
    else:
        threshold = 0.3

        vectorizer = load_pickle(config.vectorizer)
        binarizer = load_pickle(config.binarizer)

        print('Loading the classifier')
        classifier = load_pickle(config.classifier)

        corpus, topics = build_corpus_and_topics(config.data['test'])

        print('Transforming corpus by vectorizer')
        x = vectorizer.transform(corpus)
        print('Transforming article topics by binarizer')
        y_true = binarizer.transform(topics)
Example no. 9

def load_active_during_test():
    items_data = pd.read_csv("data_modified/item_profile.csv",
                             delim_whitespace=True,
                             dtype={'id': int, 'active_during_test': str},
                             usecols=['id', 'active_during_test'])

    return set(items_data[items_data.active_during_test == '1']['id'].values)


active_items = load_active_during_test()
# load data from csv
interactions_map = load_interactions()

urm_user = load_sparse_csr('urm_user_based_full.npz')
urm_item = load_sparse_csr('urm_item_based_full.npz')
urm_funk = load_sparse_csr('urm_funk_full.npz')

urm_user = normalize(urm_user)
urm_item = normalize(urm_item)
urm_funk = normalize(urm_funk)

# 0.024365000000000008 0.12 0.24
alpha = 0.3
beta = 0.3

# for beta in np.arange(start=0.28, stop=10, step=0.01):
estimated_urm = urm_item * alpha + urm_user * beta + urm_funk * (1.0 - alpha - beta)

# write recommendations
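The script breaks off before the recommendation-writing step. A hypothetical sketch of pulling the top-5 active items for one user out of the blended matrix (treating column indices as item ids is an assumption; the real pipeline may need an id-to-index mapping and filtering of already-seen items):

import numpy as np

def recommend(estimated_urm, user_index, n=5):
    # Rank items for one user by their blended scores and keep only the items
    # that are active during the test period (active_items loaded above).
    scores = np.asarray(estimated_urm[user_index].todense()).ravel()
    ranked = np.argsort(-scores)
    return [item for item in ranked if item in active_items][:n]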
Example no. 10
    parser = argparse.ArgumentParser()
    parser.add_argument('ct_path',
                        type=str,
                        default=None,
                        help='Path to count matrices')
    parser.add_argument('out_dir',
                        type=str,
                        default=None,
                        help='Directory for saving output files')
    args = parser.parse_args()

    ct_files = [f for f in utils.iter_files(args.ct_path)]

    logger.info('Loading the zeroth count matrix...')
    mat, metadata = utils.load_sparse_csr(ct_files[0])

    DOC2IDX, doc_ids = metadata['doc_dict']

    for i in range(1, len(ct_files)):

        logger.info('Loading %ith count matrix...' % i)
        nxt_mat, nxt_metadata = utils.load_sparse_csr(ct_files[i])

        if metadata['hash_size'] != nxt_metadata['hash_size']:
            raise RuntimeError('hash_size not equal in %ith file' % i)
        if metadata['ngram'] != nxt_metadata['ngram']:
            raise RuntimeError('ngram not equal in %ith file' % i)

        logger.info('Merging...')
        mat = sp.hstack([mat, nxt_mat])
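Note that sp.hstack returns a COO matrix by default, so the merged result would normally be converted back to CSR before being written out. The snippet is truncated before that step; a hedged sketch of what it might look like, reusing the save helper sketched after Example no. 3 (the output filename is illustrative):

    # Convert back to CSR and save the merged matrix with the shared metadata.
    mat = mat.tocsr()
    out_file = os.path.join(args.out_dir, 'merged-' + os.path.basename(ct_files[0]))
    utils.save_sparse_csr(out_file, mat, metadata)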
Example no. 11
                        default=None,
                        help='Path to count matrix file')
    parser.add_argument('out_dir',
                        type=str,
                        default=None,
                        help='Directory for saving output files')
    parser.add_argument('--model',
                        type=str,
                        default='tfidf',
                        help=('tfidf or pmi'))

    args = parser.parse_args()

    logger.info('Loading count matrix...')

    count_matrix, metadata = utils.load_sparse_csr(args.ct_path)

    logger.info('Making %s vectors...' % args.model)

    if args.model == 'tfidf':
        mat = get_tfidf_matrix(count_matrix)
    elif args.model == 'pmi':
        mat = get_pmi_matrix(count_matrix)
    else:
        raise RuntimeError('Model %s is invalid' % args.model)

    basename = os.path.splitext(os.path.basename(args.ct_path))[0]
    basename = ('%s-' % args.model) + basename

    if not os.path.exists(args.out_dir):
        logger.info("Creating data directory")
Example no. 12
import utils
import numpy as np
from scipy import sparse
from sklearn.externals import joblib

# root of this code is: /Google Drive/CIKM_AnalytiCup_2017/Code/CIKM_AnalytiCup_2017/
filename = 'features/features_title_desc_attributes_1_3_word_1_5_char.npz'
X = utils.load_sparse_csr(filename)
XX = utils.load_sparse_csr(filename.split('.')[-2] + '_valid.npz')
loaded = np.load(filename.split('.')[-2] + '_dict.npz')
words = loaded['words'].tolist()
chars = loaded['chars'].tolist()

feature_names = [
    'title.getLengths',
    'title.getLengthsByTerm',
    'title.getSpecialCharStat',
    'title.hasSpecialChar',
    'title.getUpperCharStat',
    'title.getNounStat',
    'title.getVerbStat',
    'title.getAdjectiveStat',
    'title.hasNumber',
    'title.getNumberStat',
    'title.hasTamilChar',
    'title.hasChineseChar',
    'title.getNonEnglishCharStat',
    'title.getColorStat',
    'title.getBrandStat(brands)',
    'title.getSyllableStat',
    'title.getPolysyllabStat',