def train_vectors(train_article_name, test_data, corpus, window=window, epochs=epochs, cd_data=cd_data):
    """
    Trains the Word2Vec model on test_data, then saves the vectors in the models directory.
    train_article_name -> String of the file name, excluding the path.
    test_data -> Fully tokenized read_article from ProcessArticle.full_tokenize(doc).
    corpus -> Corpus dictionary from ProcessArticle.generate_corpus(train_article.tolist).
    window -> Default window parameter from Word2Vec.
    epochs -> Default epochs parameter from Word2Vec.
    cd_data -> Data directory path as a string.
    """
    w2v = Word2Vec(test_data, window=window)
    w2v.train(test_data, total_words=len(corpus), epochs=epochs)
    SaveLoad.save(w2v, cd_models + 'vectors.w2v')
    return w2v.wv
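A minimal sketch of reading the saved model back and querying it; the path and the query token are illustrative, not taken from the original module.

from gensim.models import Word2Vec
from gensim.utils import SaveLoad

# SaveLoad.save() above pickled the full Word2Vec model, so load() gives it back unchanged
w2v = SaveLoad.load('models/vectors.w2v')
print(w2v.wv.most_similar('market', topn=5))  # 'market' is only an example token; raises KeyError if it is out of vocabulary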
def main():
    arg1 = sys.argv[1]
    one_train, abstract_train, seven_train, month_train = defaultdict(
        list), defaultdict(list), defaultdict(list), defaultdict(list)
    one_test, seven_test, month_test = defaultdict(list), defaultdict(
        list), defaultdict(list)
    # nltk.download('stopwords')
    print("start pre-processing the data")
    bigram = SaveLoad.load("data/phrase_xxx/big_phrase.pickle")
    trigram = SaveLoad.load("data/phrase_xxx/trig_phrase.pickle")
    label_one = pd.read_pickle("data/label_one_new.pickle")
    print("starting the training selecting phase")
    Ding(label_one, bigram, trigram)
    #Ding_abstract(label_one,bigram,trigram,types=arg1)
    '''
def load_tf_idf():
    # https://radimrehurek.com/gensim/tut2.html
    '''
    corpus = corpora.MmCorpus('G:\wiki_dump\wiki_en_corpus')
    corpus.serialize('G:\wiki_dump2\wiki_en_tf_idf', corpus,
                     'G:\wiki_dump2\id2word', 'G:\wiki_dump2\index')
    '''
    corpus = corpora.MmCorpus('G:\wiki_dump\wiki_en_corpus')
    # Dictionary.load() is a classmethod that returns the loaded dictionary
    dictionary = corpora.Dictionary.load('G:\wiki_dump\wiki_en_corpus.dict')
    new_doc = "Human computer interaction hello hello"
    new_vec = dictionary.doc2bow(new_doc.lower().split())  # already a bag-of-words: [(term_id, count), ...]
    tfidf = SaveLoad.load('G:\wiki_dump\wiki_en_tf_idf')
    tuple1 = (20, 1)
    tuple2 = (30, 2)
    query = [tuple1, tuple2]
    '''
    The query is composed of term ids and the corresponding frequencies present in the textual fragment.
    Note: TF-IDF is not aware of the string representation of a term, so each term has to be
    transformed into its term id first.
    '''
    vector = tfidf[query]
    print(vector)
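If the query starts from raw text rather than hand-built (term_id, frequency) tuples, the dictionary can do the id mapping first. A small sketch reusing the paths from the snippet above; the query string itself is arbitrary.

from gensim import corpora
from gensim.utils import SaveLoad

dictionary = corpora.Dictionary.load(r'G:\wiki_dump\wiki_en_corpus.dict')
tfidf = SaveLoad.load(r'G:\wiki_dump\wiki_en_tf_idf')

# map raw tokens to term ids, then weight them with the TF-IDF model
query_bow = dictionary.doc2bow("human computer interaction".lower().split())  # [(term_id, count), ...]
print(tfidf[query_bow])                                                       # [(term_id, tf-idf weight), ...]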
def main():
    arg1 = sys.argv[1]
    one_train, abstract_train, seven_train, month_train = defaultdict(list), defaultdict(list), defaultdict(
        list), defaultdict(list)
    one_test, seven_test, month_test = defaultdict(list), defaultdict(list), defaultdict(list)
    # nltk.download('stopwords')
    print("start pre-processing the data")
    bigram = SaveLoad.load("data/phrase_xxx/big_phrase.pickle")
    trigram = SaveLoad.load("data/phrase_xxx/trig_phrase.pickle")
    label_one = pd.read_pickle("data/label_one_new.pickle")
    label_seven = pd.read_pickle("data/label_seven.pickle")
    label_month = pd.read_pickle("data/label_month.pickle")
    print("starting the training selecting phase")
    Ding(label_one, bigram, trigram, types=arg1)
    #Ding_abstract(label_one, bigram, trigram, types=str(arg1))
    '''os.chdir('/home/huicheng/PycharmProjects/stock/pickle')
def main():
    parser = argparse.ArgumentParser(
        description=
        'serializes the dictionary of a given binary .pkl or .pkl.bz2 bag-of-words file to a text-based id2word .txt file',
        epilog='Example: ./{} mycorpus-bow.pkl.bz2 mycorpus-dict.txt'.format(
            sys.argv[0]))
    parser.add_argument(
        'model_pkl',
        type=argparse.FileType('r'),
        help='path to input .pkl or .pkl.bz2 bag-of-words model file')
    parser.add_argument('id2word',
                        type=argparse.FileType('w'),
                        help='path to output .txt id2word file')
    args = parser.parse_args()
    input_model_pkl_path = args.model_pkl.name
    output_id2word_path = args.id2word.name

    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.level = logging.INFO

    logger.info('serializing id2word-mapping of {} to {}'.format(
        input_model_pkl_path, output_id2word_path))
    model = SaveLoad.load(input_model_pkl_path)
    model.dictionary.save_as_text(output_id2word_path)
def preload_models(self):
    start = time.time()
    print("Preloading models...\n")
    # self.dictionary = corpora.Dictionary.load(self.serialize_dict)
    self.dictionary = SaveLoad.load(str(self.serialize_dict))
    print("\tDictionary loaded.")
    self.tfidf = SaveLoad.load(str(self.serialize_tfidf))
    print("\tTFIDF loaded.")
    self.similarities = SaveLoad.load(str(self.serialize_similarities))
    print("\tSimilarities loaded.")
    # self.corpus_vector = corpora.MmCorpus(serialize_vector)
    print("\tPreloading Completed. time cost: {}".format(
        round(time.time() - start, 2)))
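For context, a hedged sketch of how the three preloaded objects typically fit together at query time. The function name, tokenisation and top_n cutoff are assumptions, not part of the original class.

def query(dictionary, tfidf, index, text, top_n=10):
    """Rank indexed documents against `text` using the objects loaded in preload_models()."""
    # raw text -> bag-of-words of the loaded dictionary -> TF-IDF weights -> similarity scores
    bow = dictionary.doc2bow(text.lower().split())
    sims = index[tfidf[bow]]
    return sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)[:top_n]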
def test_saveload_func(self):
    dfilter = DocumentFilterTransform(odd_document_filter_func)
    docf_corpus = dfilter[self.vtcorp]

    pname = self.loader.pipeline_name('docfiltered')
    docf_corpus.save(pname)

    loaded_corpus = SaveLoad.load(pname)
    print log_corpus_stack(loaded_corpus)

    self.assertIsInstance(loaded_corpus, type(docf_corpus))

    filtered_docs = [d for d in loaded_corpus]
    self.assertEqual(len(filtered_docs), len(self.vtcorp) / 2)
def main():
    # nltk.download('stopwords')
    print("start pre-processing the data")
    bigram = SaveLoad.load("big_phrase.pickle")
    trigram = SaveLoad.load("trig_phrase.pickle")
    label_one = pd.read_pickle("label_one_new_GOOG.pickle")[
        '2014-05-01':]  # ['2006-11-20':'2013-11-21']
    path = '/Users/maobu/Dropbox/stock/data/ding/'
    length = label_one.shape[0]
    train = label_one[0:int(length * 0.8)]
    validate = label_one[int(length * 0.8):int(length * 0.9)]
    test = label_one[int(length * 0.9):-1]
    train.reset_index().to_csv(path + "train_label_new.csv", index=False, encoding='utf-8')
    validate.reset_index().to_csv(path + "validate_label_new.csv", index=False, encoding='utf-8')
    test.reset_index().to_csv(path + "test_label_new.csv", index=False, encoding='utf-8')
    print("starting the training selecting phase")
    Ding_abstract(label_one, bigram, trigram, path)
def main():
    # sql for pulling final format dataset
    sql = """
        with keywords as (
            select topic_id
                 , group_concat(word, ', ') as Keywords
            from (
                select topic_id
                     , word
                     , prob
                from topic_keywords
                order by topic_id
                       , prob desc
            )
            group by topic_id
        )
        select sent_at
             , from_userid
             , topic_id as Chat_Topic
             , Keywords
             , lemma
        from chats
        join topic_labels using (chat_id)
        join keywords using (topic_id)
        join lemmas using (chat_id)
        """

    # load up the model vocabulary so we can make sure that at least one word was included
    vocab = SaveLoad.load('../model_development/saved_models/model_user_day_room_10.id2word')

    # store to csv for ease of use in the visualization
    df = pd.DataFrame()
    with sqlite3.connect('../database/chat.db') as conn:
        total_len = pd.read_sql('select count(*) from chats', conn).iloc[0, 0]
        progress = 0
        for chunk in pd.read_sql(sql, conn, chunksize=100000):
            # remove chats that were too short to have any words from the model vocabulary
            chunk['vocab_words'] = [len(vocab.doc2bow(text)) for text in chunk['lemma'].str.split(' ').tolist()]
            df = df.append(chunk.loc[chunk['vocab_words'] > 0, ['sent_at', 'from_userid', 'Chat_Topic', 'Keywords']])
            progress += len(chunk.index)
            print(round(progress / total_len * 100, 2), '%...', end='\r')

    df.to_csv('../model_development/final_dominant_topic_text_df_FULL.txt', index=False, sep='\t')
    print(len(df.index), 'out of', total_len, 'chats were saved for final visualization!')
def main():
    parser = argparse.ArgumentParser(
        description='serializes given binary .pkl or .pkl.bz2 file to text-based .mm model file in MatrixMarket format '
                    '(requires that collection data used for creation of the .pkl file is still available!)',
        epilog='Example: ./{} bowmodel.pkl.bz2 bowmodel.mm'.format(sys.argv[0]))
    parser.add_argument('model_pkl',
                        type=argparse.FileType('r'),
                        help='path to input .pkl or .pkl.bz2 model file (bag-of-words, tf-idf)')
    parser.add_argument('model_mm',
                        type=argparse.FileType('w'),
                        help='path to output .mm model file')
    args = parser.parse_args()
    input_model_pkl_path = args.model_pkl.name
    output_model_mm_path = args.model_mm.name

    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.level = logging.INFO

    logger.info('serializing {} to {}'.format(input_model_pkl_path, output_model_mm_path))
    model = SaveLoad.load(input_model_pkl_path)
    MmCorpus.serialize(output_model_mm_path, model)
def loadTfidfModel(self, type='offline'):
    '''
    Load the TF-IDF model; if the model does not exist yet, build it first.
    '''
    filePath = self.cachePath + '%s_tfidf_%s.model' % (self.name, type)
    if os.path.isfile(filePath):
        tfidfModel = SaveLoad.load(filePath)
    else:
        startTime = datetime.now()
        if type not in self.dictionary:
            self.loadDictionary(type)
        tfidfModel = TfidfModel(dictionary=self.dictionary[type])
        # tfidfModel = makeTfidfModel(self.dictionary)
        tfidfModel.save(filePath)
        print('train tfidfModel time:', datetime.now() - startTime)
    self.tfidfModel[type] = tfidfModel
    return tfidfModel
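A possible way to use the cached model once it has been returned; `fe` stands in for an instance of the surrounding feature class, and the tokens are purely illustrative.

fe_tfidf = fe.loadTfidfModel(type='offline')
bow = fe.dictionary['offline'].doc2bow(['gensim', 'tfidf', 'cache', 'example'])
print(fe_tfidf[bow])  # [(term_id, tf-idf weight), ...] for terms known to the dictionary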
def main():
    parser = argparse.ArgumentParser(
        description=
        'applies a trained lda model to a bag-of-words and saves the resulting corpus topics as a binary numpy dense matrix file (rows=documents, cols=topics)'
    )
    parser.add_argument('--bow',
                        type=argparse.FileType('r'),
                        help='path to input bow file (.mm/.mm.bz2)',
                        required=True)
    parser.add_argument('--model-prefix',
                        type=argparse.FileType('r'),
                        help='prefix of input binary lda model files',
                        required=True)
    parser.add_argument('--document-topics',
                        type=argparse.FileType('w'),
                        help='path to output dense matrix .npz file')
    args = parser.parse_args()
    input_bow_path = args.bow.name
    input_model_prefix = args.model_prefix.name
    output_document_topics_path = args.document_topics.name

    logger.info('loading bow corpus from {}'.format(input_bow_path))
    bow = MmCorpus(input_bow_path)

    logger.info('loading topic model from {}'.format(input_model_prefix))
    model = SaveLoad.load(input_model_prefix)

    logger.info('generating dense document-topic-matrix: {} docs, {} topics'.format(
        bow.num_docs, model.num_topics))
    document_topics = corpus2dense(model[bow], model.num_topics, bow.num_docs).T
    logger.info('generated dense matrix of shape {}'.format(document_topics.shape))
    logger.debug('dense matrix\n{}'.format(document_topics))

    logger.info('saving dense matrix to {}'.format(output_document_topics_path))
    save_npz(output_document_topics_path, document_topics)
def main(args):
    logging.info('Initializing loaders with root %s, name %s' % (
        args.root, args.name))
    dloader = MultimodalShardedDatasetLoader(args.root, args.name)
    iloader = IndexLoader(args.root, args.name)

    logging.info('Loading pipeline with label %s' % args.label)
    pipeline_name = dloader.pipeline_name(args.label)
    pipeline = SaveLoad.load(pipeline_name)

    index_prefix = iloader.output_prefix(args.label)
    logging.info('Creating index with prefix %s' % index_prefix)
    dimension = safire.utils.transcorp.dimension(pipeline)
    index = similarities.Similarity(index_prefix, pipeline,
                                    num_features=dimension)

    iloader.save_index(index, args.label)
def main(args): if args.root == 'test': args.root = safire.get_test_data_root() args.name = 'test-data' # Initializing loaders logging.info('Initializing loaders with root %s, name %s' % ( args.root, args.name)) mdloader = MultimodalShardedDatasetLoader(args.root, args.name) mloader = ModelLoader(args.root, args.name) # Loading datasets if args.mm_label and (args.img_label or args.text_label): raise ValueError('Cannot specify both mm_label and' ' img_label/text_label.') if not args.img_label and not args.text_label and not args.mm_label: raise ValueError('Must specify text/image label or both or mm_label.') if args.img_label and args.text_label: logging.info('Will train a multimodal model: text label {0}, image ' 'label {1}.'.format(args.img_label, args.text_label)) logging.info('Assuming') #raise ValueError('Can only specify one of text and image label.') # Need to refactor dataset loading. # ...no more difference in principle between image labels and text labels. if args.img_label: logging.info('Loading image dataset with img. label {0}' ''.format(args.img_label)) pipeline_fname = mdloader.pipeline_name(args.img_label) # - load the pipeline img_pipeline = SaveLoad.load(fname=pipeline_fname) # cast to Dataset img_pipeline = Dataset(img_pipeline) if args.text_label: logging.info('Loading text dataset with text label {0}' ''.format(args.text_label)) pipeline_fname = mdloader.pipeline_name(args.text_label) # - load the pipeline text_pipeline = SaveLoad.load(fname=pipeline_fname) # - Cast to dataset text_pipeline = Dataset(text_pipeline, ensure_dense=True) # This is specifically a text transformation. if args.w2v: logging.info('Building and applying word2vec sampler. Note that ' 'this will mean no serialization is performed after' ' flattening, in case this is applied in a multimodal' ' setting.') w2v_trans = Word2VecTransformer(args.w2v, get_id2word_obj(text_pipeline)) w2v_sampler = Word2VecSamplingDatasetTransformer(w2v_trans) text_pipeline = w2v_sampler[text_pipeline] if (not args.text_label) and args.img_label: pipeline = img_pipeline elif args.text_label and (not args.img_label): pipeline = text_pipeline elif args.text_label and args.img_label: logging.info('Combining text and image sources into a multimodal ' 'pipeline.') logging.info('Text pipeline:\n{0}'.format(log_corpus_stack(text_pipeline))) logging.info('Image pipeline:\n{0}'.format(log_corpus_stack(img_pipeline))) # - Combine into CompositeDatasest mm_composite_dataset = CompositeDataset((text_pipeline, img_pipeline), names=('txt', 'img'), aligned=False) # - Flatten the dataset # - Load flatten indices t2i_file = os.path.join(mdloader.root, mdloader.layout.textdoc2imdoc) # t2i_map = parse_textdoc2imdoc_map(t2i_file) # t2i_list = [[text, image] # for text in t2i_map # for image in t2i_map[text]] # Sorting the indices is an optimization for underlying ShardedCorpus # serializers. t2i_indexes = compute_docname_flatten_mapping(mm_composite_dataset, t2i_file) # - Initialize flattening transformer flatten = FlattenComposite(mm_composite_dataset, indexes=t2i_indexes) # - Apply pipeline = flatten[mm_composite_dataset] if not args.w2v: # - Serialize, because multimodal indexed retrieval is *slow* mm_serialization_label = args.text_label + '__' + args.img_label serialization_name = mdloader.pipeline_serialization_target( mm_serialization_label) logging.info('Serializing flattened multimodal data to {0}.' 
''.format(serialization_name)) logging.debug('Pre-serialization pipeline: {0}' ''.format(log_corpus_stack(pipeline))) serializer = Serializer(pipeline, ShardedCorpus, serialization_name, dim=dimension(pipeline), gensim_retrieval=False) pipeline = serializer[pipeline] mm_name = mdloader.pipeline_name(mm_serialization_label) pipeline.save(mm_name) else: logging.warn('Word2vec sampling active, cannot serialize flattened' 'corpus.') if args.mm_label: logging.info('Loading multimodal pipeline with label {0}' ''.format(args.mm_label)) pipeline_name = mdloader.pipeline_name(args.mm_label) pipeline = SaveLoad.load(pipeline_name) logging.info('Loaded pipeline:\n{0}'.format(log_corpus_stack(pipeline))) # - cast to dataset dataset = smart_cast_dataset(pipeline, test_p=0.1, devel_p=0.1, ensure_dense=True) logging.info('Setting up %s handle with output dimension %d' % (args.model, args.n_out)) # Loading model class try: model_class = getattr(models, args.model) except AttributeError: raise ValueError('Invalid model specified: %s' % args.model) check_model_dataset_compatibility(dataset, model_class) # Setting up model initialization arguments activation = init_activation(args.activation) if not args.backward_activation: args.backward_activation = args.activation backward_activation = init_activation(args.backward_activation) model_init_args = { 'heavy_debug': args.heavy_debug, 'activation': activation, 'backward_activation': backward_activation } if args.model == 'DenoisingAutoencoder': model_init_args['corruption_level'] = args.corruption model_init_args['reconstruction'] = args.reconstruction model_init_args['L1_norm'] = args.L1_norm model_init_args['L2_norm'] = args.L2_norm model_init_args['bias_decay'] = args.bias_decay model_init_args['sparsity_target'] = args.sparsity model_init_args['output_sparsity_target'] = args.output_sparsity if args.model == 'SparseDenoisingAutoencoder': model_init_args['corruption_level'] = args.corruption model_init_args['sparsity_target'] = args.sparsity model_init_args['reconstruction'] = args.reconstruction if args.model == 'RestrictedBoltzmannMachine' or args.model == 'ReplicatedSoftmax': model_init_args['sparsity_target'] = args.sparsity model_init_args['output_sparsity_target'] = args.output_sparsity model_init_args['CD_k'] = args.CD_k model_init_args['bias_decay'] = args.bias_decay model_init_args['CD_use_mean'] = not args.CD_use_sample model_init_args['prefer_extremes'] = args.prefer_extremes model_init_args['L1_norm'] = args.L1_norm model_init_args['L2_norm'] = args.L2_norm model_init_args['noisy_input'] = args.noisy_input logging.info('\nModel init args:' + u'\n'.join([u' {0}: {1}'.format(k, v) for k, v in model_init_args.items()])) # Set up model model_handle = model_class.setup(dataset, n_out=args.n_out, **model_init_args) logging.info('Setting up learner...') lloader = LearnerLoader(args.root, args.name) learner = None if args.resume: try: learner = lloader.load_learner(args.transformation_label) except Exception: logging.warn('Could not load learner for resuming training, will' 'start again. 
(Infix: %s)' % args.transformation_label) if not learner: learner = BaseSGDLearner( n_epochs=args.n_epochs, b_size=args.batch_size, validation_frequency=args.validation_frequency, track_weights=args.track_weights, track_weights_change=args.track_weights_change, plot_transformation=args.plot_transformation, plot_weights=args.plot_weights, plot_every=args.plot_every, plot_on_init=args.plot_on_init) # Intermediate model saving during training if args.save_every: learner_saving_overwrite = not args.no_overwrite_intermediate_saves learner.set_saving(infix=args.transformation_label, model_loader=mloader, save_every=args.save_every, overwrite=learner_saving_overwrite) logging.info('Setting up and training transformer...') # Training starts here. transformer = SafireTransformer(model_handle, dataset, learner, attempt_resume=args.resume, profile_training=args.profile_training, dense_throughput=True) # Training is done at this point. if args.no_save: args.no_corpus_transform = True args.no_dataset_transform = True args.no_save_transformer = True args.no_save_learner = True if not args.no_save_learner: logging.info('Saving learner with label %s' % args.transformation_label) lloader.save_learner(learner, args.transformation_label) if args.plot_monitors: logging.info('Plotting monitors to %s' % args.plot_monitors) plt.figure() monitor = learner.monitor training_cost = monitor['training_cost'] validation_cost = monitor['validation_cost'] tc_x = map(operator.itemgetter(0), training_cost) tc_y = map(operator.itemgetter(1), training_cost) vc_x = map(operator.itemgetter(0), validation_cost) vc_y = map(operator.itemgetter(1), validation_cost) plt.plot(tc_x, tc_y, 'b') plt.plot(vc_x, vc_y, 'g') plt.savefig(args.plot_monitors) if not args.no_save_transformer: logging.info('Saving transformer with label %s' % args.transformation_label) mloader.save_transformer(transformer, args.transformation_label) logging.info('Creating transformed corpus with label {0}' ''.format(args.transformation_label)) # This applies the transformation to the input corpus. pipeline = transformer[pipeline] # Serialization (this should be wrapped in some utility function?) # Doesn't always have to happen. (Difference from dataset2corpus.) if args.serialize: serializer_class = ShardedCorpus data_name = mdloader.pipeline_serialization_target(args.transformation_label) serialization_start_time = time.clock() logging.info('Starting serialization: {0}' ''.format(serialization_start_time)) serializer_block = Serializer(pipeline, serializer_class, data_name, dim=dimension(pipeline)) serialization_end_time = time.clock() logging.info('Serialization finished: {0}' ''.format(serialization_end_time)) pipeline = serializer_block[pipeline] # Now we save the pipeline. This is analogous to the Dataset2Corpus step. # In this way, also, the learned transformation is stored and can be # recycled, and other handles can be derived from the sftrans.model_handle. pipeline_savename = mdloader.pipeline_name(args.transformation_label) logging.info(' Pipeline name: {0}'.format(pipeline_savename)) pipeline.save(pipeline_savename)
"SW_Dev_Web", "Business_Analyst/BI", "PM_Mkt_Vertrieb", "IT_Admin", "SW_Dev_Mobile/UI_Design", "Infra_Server_Admin", "DB_Dev_Admin", "IT_Consultant", "Infra_Network_Admin", "Data_Engr", "SW_Dev_Web", "SW_Dev_Web_Frontend", "IT_Consultant_Operations", ] load_bigrams = SaveLoad.load("models/bigram_skills_title") def text_processing(text): """Normalize, tokenize, stem the original text string Args: text: string. String containing message for processing Returns: cleaned: list of strings. List containing normalized and stemmed word tokens with bigrams """ try: text = re.sub(r"(\d)", " ", text.lower()) text = re.sub("[%s]" % re.escape(string.punctuation), " ", text)
def load(name):
    try:
        return SaveLoad.load(name)
    except Exception as e:
        print(e)
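A round-trip sketch for the wrapper above: any gensim object that inherits from utils.SaveLoad (Dictionary, TfidfModel, Phrases, trained models, ...) can be persisted with .save() and read back through this helper. The file name is illustrative.

from gensim.corpora import Dictionary

d = Dictionary([["hello", "world"], ["hello", "gensim"]])
d.save("example.dict")           # pickles the object (large arrays may be stored in side files)
reloaded = load("example.dict")  # uses the helper defined above
print(reloaded.token2id)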
import pickle
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from gensim.utils import SaveLoad
from google.colab import drive

drive.mount('/content/gdrive')

# Loading google vectors and gensim model from drive
googlevecs = KeyedVectors.load_word2vec_format(
    '/content/gdrive/My Drive/GoogleNews-vectors-negative300.bin',
    binary=True)  # Path should be given where Google News word vectors are saved
model = SaveLoad.load(
    '/content/gdrive/My Drive/Thesis/wordvectors'
)  # Path should be given where the model created using gensim is saved

# Loading test list
with open('test_list.txt', 'rb') as f:
    test_list = pickle.load(f)

# Creating test set inputs
test_inputs = list()
for sentence in test_list:
    v = np.zeros(300)
    for word in sentence:
        if word in model.wv:
            v += model.wv[word]
def main(param_file=None):
    # setup
    p, base_path, output_dir = tools.setup(param_file)
    result_path = path.join(base_path, p['result_path'])
    lee_corpus = path.join(base_path, p['lee_corpus'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # remember starting time for runtime evaluation
    start = datetime.now()

    # load model and corpus
    logger.info('loading word mapping')
    dictionary = Dictionary.load(path.join(result_path, p['run'], p['dict_extension']))

    model_path = path.join(result_path, p['run'], p['lsi_ext'])
    logger.info('load model from: %s' % model_path)
    lsi = LsiModel.load(model_path)
    pre = SaveLoad.load(path.join(result_path, p['run'], p['pre_model_ext']))

    logging.info('load small lee corpus and preprocess')
    with open(lee_corpus, 'r') as f:
        preproc_lee_texts = preprocessing.preprocess_documents(f.readlines())
    bow_lee_texts = [dictionary.doc2bow(text, allow_update=False, return_missing=False)
                     for text in preproc_lee_texts]

    logger.info('transforming small lee corpus (only pre model)')
    corpus_pre = pre[bow_lee_texts]

    # read the human similarity data and flatten upper triangular
    human_sim_matrix = np.loadtxt(path.join(base_path, p['human_data_file']))
    sim_m_size = np.shape(human_sim_matrix)[0]
    human_sim_vector = human_sim_matrix[np.triu_indices(sim_m_size, 1)]

    max_topics = lsi.num_topics
    logger.info("iterate from %d to %d dimensions (stepsize: %d)" %
                (p['min_dim'], max_topics, p['dim_step']))

    iter_range = range(p['min_dim'], max_topics, p['dim_step'])
    res = np.zeros(len(iter_range))
    for k, l in enumerate(iter_range):
        # do the lower dimensionality transformation
        lsi.num_topics = l
        corpus_lsi = lsi[corpus_pre]

        # compute pairwise similarity matrix of transformed corpus
        sim_matrix = np.zeros((len(corpus_lsi), len(corpus_lsi)))
        for i, par1 in enumerate(corpus_lsi):
            for j, par2 in enumerate(corpus_lsi):
                sim_matrix[i, j] = matutils.cossim(par1, par2)
        sim_vector = sim_matrix[np.triu_indices(len(corpus_lsi), 1)]

        # compute correlations
        cor = np.corrcoef(sim_vector, human_sim_vector)
        logger.info("step %d: correlation with lee data: %f" % (k, cor[0, 1]))
        res[k] = cor[0, 1]

    plt.figure()
    plt.plot(iter_range, res)
    plt.savefig(os.path.join(output_dir, 'cor_plot.' + p['plot_extension']))
    plt.close()
    np.save(path.join(output_dir, 'model_dim_res.npy'), res)

    dif = datetime.now() - start
    logger.info("finished after %d days and %d secs" % (dif.days, dif.seconds))
def load_or_build_lda_model(conn):
    try:
        # load vocab dictionary
        vocab = SaveLoad.load('../model_development/saved_models/model_user_day_room_10.id2word')
        # load model
        lda_model = LdaModel.load('../model_development/saved_models/model_user_day_room_10')
        print('Pretrained lda model loaded!')
    except:
        # query for aggregating texts per user per room per day
        sql = """
            select group_concat(lemma, ' ') as lemma
            from lemmas
            join (
                select chat_id
                     , from_userid
                     , strftime('%Y-%m-%d', sent_at) as sent_day
                     , room_id
                from chats
            ) using (chat_id)
            where nullif(lemma, '') is not null
            group by from_userid
                   , sent_day
                   , room_id
            order by random();
            """

        # get vocabulary
        MIN_OCCURANCE = 100
        vocab = Dictionary([pd.read_sql('select word from words where freq >= {}'.format(MIN_OCCURANCE),
                                        conn)['word'].tolist()])

        # models for different number of topics
        N_EPOCHS = 10
        n_topics = 10
        style = 'user_day_room'

        # init model
        lda_model = LdaModel(
            id2word=vocab,
            num_topics=n_topics,
            alpha='auto',
            per_word_topics=True)

        # do training
        print('training model_{0}_{1}'.format(style, n_topics))
        for epoch in range(N_EPOCHS):
            print('\tepoch', epoch, '...', end='\r')
            for chunk in pd.read_sql(sql, conn, chunksize=10000):
                chunk_corpa = [vocab.doc2bow(text) for text in chunk['lemma'].str.split(' ').tolist()]
                lda_model.update(chunk_corpa)
            print('\tepoch', epoch, '... done!')

        # Save model to disk.
        lda_model.save("saved_models/model_{0}_{1}".format(style, n_topics))

    return vocab, lda_model
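Illustrative follow-up (the database path and chat text are assumptions): once the vocabulary and model are in hand, a new chat can be assigned its dominant topic.

import sqlite3

with sqlite3.connect('../database/chat.db') as conn:
    vocab, lda_model = load_or_build_lda_model(conn)

bow = vocab.doc2bow("thanks for the great stream today".split())
topic_probs = lda_model.get_document_topics(bow)            # [(topic_id, probability), ...]
dominant_topic = max(topic_probs, key=lambda tp: tp[1])[0]  # assumes at least one word was in the vocabulary
print(dominant_topic)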
def main(args): _starttime = time.clock() if args.root == 'test': args.root = safire.get_test_data_root() args.name = 'test-data' logging.info('Initializing dataset loader with root %s, name %s' % (args.root, args.name)) loader = MultimodalShardedDatasetLoader(args.root, args.name) if args.clear: raise NotImplementedError('Cleaning not implemented properly through' ' a loader/layout object.') if args.flatten: if args.f_text is None or args.f_images is None: raise argparse.ArgumentError('Must provide --f_text and --f_images' ' when attempting to flatten.') logging.info('Loading text pipeline and casting to dataset...') t_pipeline_name = loader.pipeline_name(args.f_text) t_pipeline = SaveLoad.load(t_pipeline_name) t_data = Dataset(t_pipeline) logging.info('Loading image pipeline and casting to dataset...') i_pipeline_name = loader.pipeline_name(args.f_images) i_pipeline = SaveLoad.load(i_pipeline_name) i_data = Dataset(i_pipeline) logging.info('Creating composite dataset...') mm_data = CompositeDataset((t_data, i_data), names=('text', 'img'), aligned=False) logging.info('Flattening dataset...') t2i_file = os.path.join(loader.root, loader.layout.textdoc2imdoc) flatten_indexes = compute_docname_flatten_mapping(mm_data, t2i_file) flatten = FlattenComposite(mm_data, indexes=flatten_indexes) flat_mm_data = flatten[mm_data] if not args.label: logging.info('Generating flattened label automatically...') args.label = '__'.join([args.f_text, args.f_images]) logging.info(' Generated label: {0}'.format(args.label)) logging.info('Serializing flattened data...') serialization_name = loader.pipeline_serialization_target(args.label) serializer = Serializer(flat_mm_data, ShardedCorpus, serialization_name) pipeline = serializer[flat_mm_data] logging.info('Saving pipeline...') pipeline_name = loader.pipeline_name(args.label) pipeline.save(fname=pipeline_name) return if args.input_label is not None: logging.info('Loading corpus with label %s' % args.input_label) pipeline_fname = loader.pipeline_name(args.input_label) pipeline = SaveLoad.load(pipeline_fname) logging.info('Loaded corpus report:\n') logging.info(log_corpus_stack(pipeline)) elif args.images: logging.info('Reading raw image data.') image_file = os.path.join(args.root, loader.layout.image_vectors) icorp = ImagenetCorpus(image_file, delimiter=';', dim=4096, label='') pipeline = icorp else: logging.info('Reading raw text data.') vtargs = {} if args.label: logging.info('VTextCorpus will have label %s' % args.label) vtargs['label'] = args.label if args.pos: logging.info('Constructing POS filter with values {0}' ''.format(list(args.pos))) vtargs['token_filter'] = PositionalTagTokenFilter(list(args.pos), 0) if args.pfilter: logging.info('Constructing positional filter: %d.' % args.pfilter) # If a fixed number of sentences is requested, use this. if args.pfilter % 1 == 0: args.pfilter = int(args.pfilter) vtargs['pfilter'] = args.pfilter if args.pfilter_fullfreq: vtargs['pfilter_full_freqs'] = args.pfilter_fullfreq if args.filter_capital: vtargs['filter_capital'] = True vtargs['tokens'] = args.tokens vtargs['sentences'] = args.sentences if args.tokens or args.sentences: # This already happens automatically inside VTextCorpus, but it # raises a warning we can avoid if we know about this in advance. 
vtargs['precompute_vtlist'] = False logging.info(u'Deriving corpus from loader with vtargs:\n{0}'.format( u'\n'.join(u' {0}: {1}'.format(k, v) for k, v in sorted(vtargs.items()))) ) vtcorp = loader.get_text_corpus(vtargs) # VTextCorpus initialization is still the same, refactor or not. logging.info('Corpus: %s' % str(vtcorp)) logging.info(' vtlist: %s' % str(vtcorp.input)) pipeline = vtcorp # Holds the data if args.tfidf: tfidf = TfidfModel(pipeline) pipeline = tfidf[pipeline] if args.top_k is not None: if args.images: logging.warn('Running a frequency-based transformer on image data' ' not a lot of sense makes, hmm?') logging.info('Running transformer with k=%i, discard_top=%i' % ( args.top_k, args.discard_top)) if args.profile_transformation: report, transformer = safire.utils.profile_run(_create_transformer, pipeline, args.top_k, args.discard_top) # Profiling output print report.getvalue() else: transformer = FrequencyBasedTransformer(pipeline, args.top_k, args.discard_top) pipeline = transformer[pipeline] if args.post_tfidf: post_tfidf = TfidfModel(pipeline) pipeline = post_tfidf[pipeline] if args.word2vec is not None: logging.info('Applying word2vec transformation with embeddings ' '{0}'.format(args.word2vec)) w2v_dictionary = get_id2word_obj(pipeline) # Extracting dictionary from FrequencyBasedTransform supported # through utils.transcorp.KeymapDict pipeline = convert_to_gensim(pipeline) word2vec = Word2VecTransformer(args.word2vec, w2v_dictionary, op=args.word2vec_op) pipeline = word2vec[pipeline] if args.w2v_filter_empty: print 'Applying word2vec empty doc filtering.' document_filter = DocumentFilterTransform(zero_length_filter) pipeline = document_filter[pipeline] if args.uniform_covariance: ucov = LeCunnVarianceScalingTransform(pipeline) pipeline = ucov[pipeline] if args.tanh: pipeline = convert_to_gensim(pipeline) tanh_transform = GeneralFunctionTransform(numpy.tanh, multiplicative_coef=args.tanh) pipeline = tanh_transform[pipeline] if args.capped_normalize is not None: logging.info('Normalizing each data point to ' 'max. value %f' % args.capped_normalize) cnorm_transform = CappedNormalizationTransform(pipeline, args.capped_normalize) pipeline = cnorm_transform[pipeline] if args.normalize is not None: logging.info('Normalizing each data point to %f' % args.normalize) norm_transform = NormalizationTransform(args.normalize) pipeline = norm_transform[pipeline] logging.info('Serializing...') # Rewrite as applying a Serializer block. if isinstance(pipeline, VTextCorpus): logging.info('Checking that VTextCorpus dimension is available.') #if not pipeline.precompute_vtlist: # logging.info(' ...to get dimension: precomputing vtlist.') # pipeline._precompute_vtlist(pipeline.input) if pipeline.n_processed < len(pipeline.vtlist): logging.info('Have to dry_run() the pipeline\'s VTextCorpus,' 'because we cannot derive its dimension.') if args.serialization_format == 'gensim': logging.info('...deferring dimension check to serialization,' ' as the requested serialization format does not' ' need dimension defined beforehand.') else: pipeline.dry_run() data_name = loader.pipeline_serialization_target(args.label) logging.info(' Data name: {0}'.format(data_name)) serializer_class = ShardedCorpus # Here, the 'serializer_class' will not be called directly. Instead, # a Serializer block will be built & applied. (Profiling serialization # currently not supported.) 
serialization_start_time = time.clock() logging.info('Starting serialization: {0}'.format(serialization_start_time)) sparse_serialization = False gensim_serialization = False if args.serialization_format == 'sparse': sparse_serialization = True elif args.serialization_format == 'gensim': gensim_serialization = True elif args.serialization_format != 'dense': logging.warn('Invalid serialization format specified ({0}), serializing' ' as dense.'.format(args.serialization_format)) serializer_block = Serializer(pipeline, serializer_class, data_name, dim=dimension(pipeline), gensim_serialization=gensim_serialization, sparse_serialization=sparse_serialization, overwrite=(not args.no_overwrite), shardsize=args.shardsize) serialization_end_time = time.clock() logging.info('Serialization finished: {0}'.format(serialization_end_time)) logging.debug('After serialization: n_processed = {0}' ''.format(safire.utils.transcorp.bottom_corpus(pipeline).n_processed)) pipeline = serializer_block[pipeline] assert isinstance(pipeline, SwapoutCorpus), 'Serialization not applied' \ ' correctly.' if args.index: iloader = IndexLoader(args.root, args.name) index_name = iloader.output_prefix(args.label) logging.info('Building index with name {0}'.format(index_name)) similarity_transformer = SimilarityTransformer(pipeline, index_name) # Should the pipeline get transformed? Or do we only want # the transformer? # What is the use case here? We need the *transformer*, not the # transformed data (that would be just the self-similarity of our # dataset), so we need to get some new input. We can retrieve # the pipeline.obj and lock the transformer onto another pipeline. pipeline = similarity_transformer[pipeline] logging.info('Corpus stats: {0} documents, {1} features.'.format( len(pipeline), safire.utils.transcorp.dimension(pipeline))) if not args.no_save_corpus: obj_name = loader.pipeline_name(args.label) logging.info('Saving pipeline to {0}'.format(obj_name)) pipeline.save(obj_name) # HACK: logging word2vec OOV if args.word2vec: # Report out-of-vocabulary statistics #oov_report = word2vec.report_oov() #logging.info(u'OOV report:\n%s' % oov_report) word2vec.log_oov() if args.word2vec_export: word2vec_to_export = word2vec.export_used() embeddings_dict = word2vec_to_export.embeddings with open(args.word2vec_export, 'wb') as w2v_export_handle: cPickle.dump(embeddings_dict, w2v_export_handle, protocol=-1) _endtime = time.clock() _totaltime = _endtime - _starttime logging.info('Total main() runtime: %d s' % int(_totaltime)) return
#         body = str(art.loc[i,'content'])+ " " +str(art.loc[i,'title'])
#         cleaned = clean(body)
#         cleaned = word_tokenize(cleaned)
#         labeled_sentances.append(cleaned)
#     except:
#         print('Error Index:',index)
# dct = Dictionary(labeled_sentances)  # fit dictionary
# dct.save("doc.dic")
####################################################################################
####################################################################################
####################################################################################
dct = SaveLoad.load("doc.dic")


def single_doc(doc_tokenized):
    tags = pos_tag(doc_tokenized)
    pos_score, neg_score = 0, 0
    for t in tags:
        word = t[0]
        pos = t[1][0]
        if pos == 'J':
            part = 'a'
        elif pos == 'N':
            part = 'n'
        elif pos == 'R':
        x[0] for x in get_womanly_words(model, adjectives_list, woman_words_list)
    ]
    print("\n{}:".format(source_mame))
    print("\nManly adjectives: \n\t{}".format(", ".join(manly_words[:8])))
    print("\nWomanly adjectives: \n\t{}".format(", ".join(womanly_words[:8])))
    return manly_words, womanly_words


if __name__ == '__main__':
    books_model_path = "../data/w2v_models/book_v1_model"
    tv_show_model_path = "../data/w2v_models/tv_show_v1_model"
    books_model = SaveLoad.load(books_model_path)
    tv_show_model = SaveLoad.load(tv_show_model_path)

    test_words_similarity(books_model, tv_show_model)
    # dim_reduction(books_model, tv_show_model)

    books_manly_words, books_womanly_words = get_similart_words_embd(
        books_model, source_mame='Books')
    tv_manly_words, tv_womanly_words = get_similart_words_embd(
        tv_show_model, source_mame='TV Show')

    similarity_df = pd.DataFrame([
        books_manly_words, books_womanly_words, tv_manly_words,
        tv_womanly_words
    ]).T
def main(args): logging.info('Executing dataset_stats.py...') loader = MultimodalShardedDatasetLoader(args.root, args.name) # Loading and/or computing if not args.load: dataset_name = loader.pipeline_name(args.dataset) dataset = SaveLoad.load(dataset_name) dataset = convert_to_dense(dataset) # if args.text: # wrapper_dataset_name = loader.pipeline_name(args.dataset) # wrapper_dataset = SaveLoad.load(wrapper_dataset_name) # dataset = wrapper_dataset.data # vtcorp = wrapper_dataset.vtcorp # print 'Dimension of underlying text data: %d' % dimension(vtcorp) # print 'Dimension of dataset: %d' % dataset.dim # # The ShardedDataset, not the text-modality wrapper # else: # dataset = loader.load_img(args.dataset).data logging.info('Loaded dataset: %d items, dimension %d' % (len(dataset), dimension(dataset))) report, stats = safire.utils.profile_run(do_stats_init, args, dataset) else: with open(args.load) as input_handle: stats = cPickle.load(input_handle) stats.report() #stats.raw_cell_histogram() #stats.cell_mass_histogram(n=100) #stats.cell_cumulative_mass(n=100) max_feature = list(stats.feature_totals).index(max(stats.feature_totals)) #stats.feature_histogram(0, stats.n_items_processed, whole_dataset=True) #stats.feature_histogram(max_feature, # stats.n_items_processed, whole_dataset=True) #stats.item_histogram(0) #stats.item_histogram(3) #stats.item_histogram(117) #stats.feature_lecunn_covariances() #stats.nnz_histogram() inspected_features = sorted([ numpy.random.choice(stats.dataset_dim, replace=False) for _ in range(100)]) #inspection_matrix = stats.pairwise_feature_matrix(inspected_features, # safire.utils.matutils.maxn_sparse_rmse) #safire.utils.heatmap_matrix(inspection_matrix, 'MaxNormalized dataset RMSE') #pairwise_avg = numpy.average(inspection_matrix) logging.info('Sampling raw matrix...') n_raw_samples = min(len(dataset), 1000) raw_matrix = numpy.array([stats.dataset[idx] for idx in stats.d_idxs[:n_raw_samples]]) if args.activation_histogram: logging.info('Computing histogram of activations...') stats.rich_histogram(raw_matrix.ravel(), n_bins=100, with_zero=False, title='Feature activation histogram') if args.average_activations: logging.info('Computing average activations...') feature_totals = numpy.array(stats.feature_totals) avg_feature_totals = feature_totals / numpy.sum(feature_totals) plt.plot(sorted(avg_feature_totals)) plt.hist(avg_feature_totals, bins=20, color='red', histtype='step', orientation='horizontal') plt.title('Sorted feature means') plt.show() if args.normalize_covariance: logging.info('Normalizing covariance...') covariances = numpy.sum(raw_matrix ** 2, axis=0) / raw_matrix.shape[0] #print covariances[:10] scaled_raw_matrix = scale_to_unit_covariance(raw_matrix) scaled_covariances = numpy.sum(scaled_raw_matrix ** 2, axis=0) / scaled_raw_matrix.shape[0] plt.figure() plt.plot(covariances, color='b') plt.plot(scaled_covariances, color='r') plt.show() #stats.feature_histogram(max_feature, n_bins=100, whole_dataset=True) #stats.rich_histogram(raw_matrix[:,max_feature], n_bins=100) #stats.rich_histogram(scaled_raw_matrix[:,max_feature], n_bins=100) #stats.rich_histogram(raw_matrix[:,0], n_bins=100) #stats.rich_histogram(scaled_raw_matrix[:,0], n_bins=100) safire.utils.heatmap_matrix(numpy.absolute(scaled_raw_matrix), title='UCov. 
dataset heatmap', with_average=True, colormap='afmhot', vmin=0.0, vmax=stats.maximum) if args.raw: safire.utils.heatmap_matrix(numpy.absolute(raw_matrix), title='Dataset heatmap', with_average=True, colormap='afmhot', vmin=0.0, vmax=stats.maximum) stats.rich_histogram(raw_matrix.ravel()) if args.correlation: logging.info('Computing correlation...') if args.normalize_covariance: corrcoef_matrix = numpy.corrcoef(scaled_raw_matrix, rowvar=0) else: corrcoef_matrix = numpy.corrcoef(raw_matrix, rowvar=0) print 'Average correlation: %f' % numpy.average(corrcoef_matrix) plt.figure(facecolor='white', figsize=(8,6)) plt.pcolormesh(numpy.absolute(corrcoef_matrix), #title='Pearson C.Coef. heatmap', cmap='afmhot', vmin=0.0, vmax=1.0) plt.colorbar() plt.xlim([0,corrcoef_matrix.shape[1]]) plt.ylim([0,corrcoef_matrix.shape[0]]) plt.show() if args.tanh: logging.info('Plotting tanh transformation...') tanh_matrix = numpy.tanh(raw_matrix / args.tanh) stats.rich_histogram(tanh_matrix, n_bins=20, title='Tanh matrix histogram.') if args.hyper: logging.info('Plotting hyperbolic transformation...') hyp_matrix = raw_matrix / (raw_matrix + args.hyper) stats.rich_histogram(hyp_matrix, n_bins=100, title='x/(1+x) matrix histogram.') if args.sparsity: logging.info('Computing sparsity...') # One entry per feature, counts how many non-zero elements are there # in each column of the raw matrix. num_nnz = numpy.array([ len([i for i in raw_matrix[:,f] if f != 0]) for f in range(raw_matrix.shape[1]) ], dtype=numpy.float32) p_nnz = num_nnz / float(raw_matrix.shape[0]) plt.plot(sorted(p_nnz)) plt.hist(p_nnz, bins=20, histtype='stepped', color='r', orientation='horizontal') #print 'Pairwise average for %d random features: %f' % (len(inspected_features), # pairwise_avg) if args.save: with open(args.save, 'w') as output_handle: cPickle.dump(stats, output_handle, protocol=-1) logging.info('Exiting dataset_stats.py.')
import pickle

from gensim import corpora, models, matutils
import numpy as np
from bson.objectid import ObjectId
import pymongo
import json

client = pymongo.MongoClient()
db = client.meetup
# groups_col = db.groups
groups_clean_extra_col = db.groups_clean_extra

_id_df = pickle.load(open('./controllers/lsi_id_df.pkl', "rb"))
count_vectorizer = pickle.load(open('./controllers/count_vectorizer.pkl', "rb"))
tfidf = models.tfidfmodel.TfidfModel.load('./controllers/tfidf.pkl')
lsi = models.LsiModel.load('./controllers/lsi')
index = SaveLoad.load('./controllers/index.pkl')


def recommend_texts(text, _id_df, count_vectorizer, tfidf, lsi, index):
    # _id_df = pickle.load(open('lsi_id_df.pkl', "rb"))
    # count_vectorizer = pickle.load(open('count_vectorizer.pkl', "rb"))
    # tfidf = models.tfidfmodel.TfidfModel.load('tfidf.pkl')
    # lsi = models.LsiModel.load('lsi')
    # #corpus_lsi = models.LsiModel.load('lsi_corpus')
    # index = SaveLoad.load('index.pkl')
    metis_vecs = count_vectorizer.transform(np.array(text)).transpose()
    metis_corpus = matutils.Sparse2Corpus(metis_vecs)
    metis_tfidf_corpus = tfidf[metis_corpus]
    metis_lsi_corpus = lsi[metis_tfidf_corpus]
    metis_doc_vecs = [doc for doc in metis_lsi_corpus]
    index.num_best = 500
    my_index = index[metis_doc_vecs]
def main():
    parser = argparse.ArgumentParser(
        description=
        'prints the topics of an LDA instance, sorted in decreasing order by their average probabilities in the collection; prints some stats of the appearances of the most important terms of the most important topics'
    )
    parser.add_argument('--bow',
                        type=argparse.FileType('r'),
                        help='path to input bow file (.mm/.mm.bz2)',
                        required=True)
    parser.add_argument('--model-prefix',
                        type=argparse.FileType('r'),
                        help='prefix of input binary lda model files',
                        required=True)
    args = parser.parse_args()
    input_bow_path = args.bow.name
    input_model_prefix = args.model_prefix.name
    logger.info('running with:\n{}'.format(
        pformat({
            'input_bow_path': input_bow_path,
            'input_model_prefix': input_model_prefix
        })))

    logger.info('loading bow corpus from {}'.format(input_bow_path))
    bow = MmCorpus(input_bow_path)

    logger.info('loading topic model from {}'.format(input_model_prefix))
    model = SaveLoad.load(input_model_prefix)
    logger.info('loaded {} docs, {} topics'.format(bow.num_docs, model.num_topics))

    logger.info('generating sparse document-topic-matrix')
    document_topic_probs = dok_matrix((bow.num_docs, model.num_topics), dtype='d')
    for docid, topics in enumerate(model[bow]):
        for topicid, prob in topics:
            document_topic_probs[docid, topicid] = prob
    document_topic_probs = document_topic_probs.tocsr()

    logger.info('calculating average topic probabilities of the document collection')
    topics_avg = document_topic_probs.sum(axis=0) / bow.num_docs
    topics_max = document_topic_probs.max(axis=0).todense().tolist()[0]
    logger.debug('avg {} shape {}'.format(topics_avg, topics_avg.shape))
    logger.debug('max {} len {}'.format(topics_max, len(topics_max)))
    topics_avg = list(enumerate(topics_avg.tolist()[0]))
    topics_avg.sort(key=lambda t: t[1], reverse=True)

    num_printed_terms = 10
    logger.info('topics with highest average probabilities')
    for topicid, topic_avg_prob in topics_avg:
        logger.info('topic ID={0}, avgprob={1:.4f}, maxprob={2:.4f}, terms:\n{3}'.format(
            topicid, topic_avg_prob, topics_max[topicid],
            model.print_topic(topicid, topn=num_printed_terms)))

    num_top_topics = min(5, len(topics_avg))
    num_top_terms = 20
    top_topics = [topicid for topicid, topic_avg in topics_avg][:num_top_topics]
    logger.info('calculating stats of top-{}-topics {} with top-{}-terms per topic'.format(
        num_top_topics, top_topics, num_top_terms))
    term_topics = defaultdict(list)  # mapping termid -> topicids, for all termids that occur in the top-k terms of any topic
    for topicid in top_topics:
        for termid, prob in get_topic_terms(model, topicid, topn=num_top_terms):
            term_topics[termid].append(topicid)
    term_topics = dict(term_topics)

    num_different_docs_per_topic = {topicid: 0 for topicid in top_topics}
    sum_bow_values_per_topic = {topicid: 0 for topicid in top_topics}
    for docid, document_term_bow in enumerate(bow):
        doc_topics = set()
        for termid, bow_value in document_term_bow:
            if termid in term_topics:
                for topicid in term_topics[termid]:
                    doc_topics.add(topicid)
                    sum_bow_values_per_topic[topicid] += bow_value
        for topicid in doc_topics:
            num_different_docs_per_topic[topicid] += 1

    for topicid in top_topics:
        logger.info('top-{}-terms of topic {} occur {} times in the collection'.format(
            num_top_terms, topicid, int(sum_bow_values_per_topic[topicid])))
        logger.info('top-{}-terms of topic {} occur in {} different documents'.format(
            num_top_terms, topicid, num_different_docs_per_topic[topicid]))
            # words = list(gensim.utils.simple_tokenize(line))
            if words:
                # print(words)
                # yield words
                f1.write(" ".join(words))
                f1.write("\n")
    # yield list(gensim.models.word2vec.LineSentence(name))


def make_bi_tri(paths, tri=False):
    sentences = PathLineSentences(paths)
    phases = Phrases(sentences)
    bigram = Phraser(phases)
    bigram.save("bigram")  # save() needs a file name; "bigram" matches the load below
    if tri:
        triphases = Phrases(bigram[sentences])
        trigram = Phraser(triphases)
        trigram.save("trigram")


if __name__ == "__main__":
    tar = tarfile.open("../saos-dump-23.02.2018.tar.gz", "r:gz")
    give_me(tar)
    tar.close()
    from_file("data", lower=True)
    sentences = PathLineSentences(os.path.join(os.getcwd(), "preprocesssed"))
    bigram = SaveLoad.load("bigram")
    trigram = SaveLoad.load("trigram")
    word = Word2Vec(trigram[bigram[sentences]], window=5, sg=0, size=300, min_count=3, workers=7)
    word.save("counted_model")
def load_phraser(modelpath: PathType, n: int) -> PhraserType:
    modelpath = os.path.join(modelpath, f"{n}gramsphraser")
    phraser = gensimSaveLoad.load(modelpath)
    return phraser
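A hedged usage sketch: chaining the 2-gram and 3-gram phrasers loaded by this helper over a token list. The model directory and the tokens are made up for illustration.

bigram = load_phraser("models", 2)
trigram = load_phraser("models", 3)

tokens = ["new", "york", "stock", "exchange", "opened", "higher"]
print(trigram[bigram[tokens]])  # frequent collocations come back joined, e.g. "new_york"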