def train_vectors(train_article_name,
                  test_data,
                  corpus,
                  window=window,
                  epochs=epochs,
                  cd_data=cd_data):
    """
    Trains the Word2Vec model using the test_data, then saves the vectors in the
    models directory.

    train_article_name -> String of the file name, excluding the path.
    test_data -> Fully tokenized read_article
         from ProcessArticle.full_tokenize(doc).
    corpus -> Corpus dictionary
         from ProcessArticle.generate_corpus(train_article.tolist()).
    window -> Default window parameter for Word2Vec.
    epochs -> Default epochs parameter for Word2Vec.
    cd_data -> Data directory path as a string.

    """

    # Build the Word2Vec model on the tokenized articles, run an explicit
    # training pass over the same data, then save the model.
    w2v = Word2Vec(test_data, window=window)
    w2v.train(test_data, total_words=len(corpus), epochs=epochs)
    SaveLoad.save(w2v, cd_models + 'vectors.w2v')

    return w2v.wv
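
# Hedged usage sketch for train_vectors(). It assumes the module-level defaults
# (window, epochs, cd_data/cd_models) exist and that ProcessArticle provides the
# helpers named in the docstring; the file name and document are illustrative only.
#
# test_data = ProcessArticle.full_tokenize(doc)
# corpus = ProcessArticle.generate_corpus(train_article.tolist())
# vectors = train_vectors("example_article.txt", test_data, corpus)
# print(vectors.most_similar("market", topn=5))
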
def main():
    arg1 = sys.argv[1]
    one_train, abstract_train, seven_train, month_train = defaultdict(
        list), defaultdict(list), defaultdict(list), defaultdict(list)
    one_test, seven_test, month_test = defaultdict(list), defaultdict(
        list), defaultdict(list)
    # nltk.download('stopwords')
    print("start pre-processing the data")
    bigram = SaveLoad.load("data/phrase_xxx/big_phrase.pickle")
    trigram = SaveLoad.load("data/phrase_xxx/trig_phrase.pickle")
    label_one = pd.read_pickle("data/label_one_new.pickle")
    print("starting the training selecting phase")
    Ding(label_one, bigram, trigram)
    #Ding_abstract(label_one,bigram,trigram,types=arg1)
Example #3
def load_tf_idf():
    #https://radimrehurek.com/gensim/tut2.html

    '''
        corpus = corpora.MmCorpus('G:\wiki_dump\wiki_en_corpus')
        corpus.serialize('G:\wiki_dump2\wiki_en_tf_idf',
                         corpus,
                         'G:\wiki_dump2\id2word',
                         'G:\wiki_dump2\index')
    '''


    corpus = corpora.MmCorpus(r'G:\wiki_dump\wiki_en_corpus')
    dictionary = corpora.Dictionary.load(r'G:\wiki_dump\wiki_en_corpus.dict')

    new_doc = "Human computer interaction hello hello"
    # doc2bow already returns the bag-of-words vector of (term_id, count) pairs.
    new_vec = dictionary.doc2bow(new_doc.lower().split())

    tfidf = SaveLoad.load(r'G:\wiki_dump\wiki_en_tf_idf')

    tuple1 = (20, 1)
    tuple2 = (30, 2)
    query = [tuple1, tuple2]
    '''
    The query is composed of the terms and their corresponding frequencies in the text fragment.
    Note: the TF-IDF model is not aware of a term's string representation; each term has to be mapped to its term id first.
    '''
    vector = tfidf[query]
    print(vector)
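
# Hedged sketch of the string-term -> term-id -> TF-IDF path noted above, assuming
# a `dictionary` and `tfidf` model loaded as in load_tf_idf(); the query text is
# illustrative only.
#
# query_text = "human computer interaction"
# query_bow = dictionary.doc2bow(query_text.lower().split())  # [(term_id, count), ...]
# print(tfidf[query_bow])                                     # [(term_id, tf-idf weight), ...]
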
def main():
    arg1 = sys.argv[1]
    one_train, abstract_train, seven_train, month_train = defaultdict(list), defaultdict(list), defaultdict(
        list), defaultdict(list)
    one_test, seven_test, month_test = defaultdict(list), defaultdict(list), defaultdict(list)
    # nltk.download('stopwords')
    print("start pre-processing the data")
    bigram = SaveLoad.load("data/phrase_xxx/big_phrase.pickle")
    trigram = SaveLoad.load("data/phrase_xxx/trig_phrase.pickle")
    label_one = pd.read_pickle("data/label_one_new.pickle")
    label_seven = pd.read_pickle("data/label_seven.pickle")
    label_month = pd.read_pickle("data/label_month.pickle")
    print("starting the training selecting phase")
    Ding(label_one, bigram, trigram,types=arg1)
    #Ding_abstract(label_one, bigram, trigram,types=str(arg1))
    # os.chdir('/home/huicheng/PycharmProjects/stock/pickle')
Example #5
def main():
    parser = argparse.ArgumentParser(
        description=
        'serializes the dictionary of a given binary .pkl or .pkl.bz2 bag-of-words file to a text-based id2word .txt file',
        epilog='Example: ./{} mycorpus-bow.pkl.bz2 mycorpus-dict.txt'.format(
            sys.argv[0]))
    parser.add_argument(
        'model_pkl',
        type=argparse.FileType('r'),
        help='path to input .pkl or .pkl.bz2 bag-of-words model file ')
    parser.add_argument('id2word',
                        type=argparse.FileType('w'),
                        help='path to output .txt id2word file')
    args = parser.parse_args()
    input_model_pkl_path = args.model_pkl.name
    output_id2word_path = args.id2word.name

    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.level = logging.INFO
    logger.info('serializing id2word-mapping of {} to {}'.format(
        input_model_pkl_path, output_id2word_path))
    model = SaveLoad.load(input_model_pkl_path)
    model.dictionary.save_as_text(output_id2word_path)
    def preload_models(self):

        start = time.time()
        print("Preloading models...\n")

        # self.dictionary = corpora.Dictionary.load(self.serialize_dict)
        self.dictionary = SaveLoad.load(str(self.serialize_dict))
        print("\tDictionary loaded.")

        self.tfidf = SaveLoad.load(str(self.serialize_tfidf))
        print("\tTFIDF loaded.")

        self.similarities = SaveLoad.load(str(self.serialize_similarities))
        print("\tSimilarities loaded.")

        # self.corpus_vector = corpora.MmCorpus(serialize_vector)
        print("\tPreloading Completed. time cost: {}".format(
            round(time.time() - start, 2)))
    def test_saveload_func(self):
        dfilter = DocumentFilterTransform(odd_document_filter_func)
        docf_corpus = dfilter[self.vtcorp]

        pname = self.loader.pipeline_name('docfiltered')
        docf_corpus.save(pname)
        loaded_corpus = SaveLoad.load(pname)
        print(log_corpus_stack(loaded_corpus))
        self.assertIsInstance(loaded_corpus, type(docf_corpus))

        filtered_docs = [d for d in loaded_corpus]
        self.assertEqual(len(filtered_docs), len(self.vtcorp) / 2)
def main():
    # nltk.download('stopwords')
    print("start pre-processing the data")
    bigram = SaveLoad.load("big_phrase.pickle")
    trigram = SaveLoad.load("trig_phrase.pickle")
    label_one = pd.read_pickle("label_one_new_GOOG.pickle")[
        '2014-05-01':]  # ['2006-11-20':'2013-11-21']
    path = '/Users/maobu/Dropbox/stock/data/ding/'
    length = label_one.shape[0]
    train = label_one[0:int(length * 0.8)]
    validate = label_one[int(length * 0.8):int(length * 0.9)]
    test = label_one[int(length * 0.9):-1]
    train.reset_index().to_csv(path + "train_label_new.csv",
                               index=False,
                               encoding='utf-8')
    validate.reset_index().to_csv(path + "validate_label_new.csv",
                                  index=False,
                                  encoding='utf-8')
    test.reset_index().to_csv(path + "test_label_new.csv",
                              index=False,
                              encoding='utf-8')
    print("starting the training selecting phase")
    Ding_abstract(label_one, bigram, trigram, path)
def main():

	# sql for pulling final format dataset
	sql = """
	with keywords as (
		select topic_id
			, group_concat(word, ', ') as Keywords
		from (
			select topic_id
				, word
				, prob
			from topic_keywords
			order by topic_id
				, prob desc
			)
		group by topic_id
		)

	select sent_at
		, from_userid
		, topic_id as Chat_Topic
		, Keywords
		, lemma
	from chats
		join topic_labels using (chat_id)
		join keywords using (topic_id)
		join lemmas using (chat_id)

	"""

	# load up the model vocabulary so we can make sure that at least one word was included
	vocab = SaveLoad.load('../model_development/saved_models/model_user_day_room_10.id2word')

	# store to csv for ease of use in the visualization
	df = pd.DataFrame()
	with sqlite3.connect('../database/chat.db') as conn:
		total_len = pd.read_sql('select count(*) from chats', conn).iloc[0,0]
		progress = 0
		for chunk in pd.read_sql(sql, conn, chunksize=100000):

			# remove chats that were too short to have any words from the model vocabulary
			chunk['vocab_words'] = [len(vocab.doc2bow(text)) for text in chunk['lemma'].str.split(' ').tolist()]
			df = df.append(chunk.loc[chunk['vocab_words']>0, ['sent_at', 'from_userid', 'Chat_Topic', 'Keywords']])
			progress += len(chunk.index)
			print(round(progress/total_len*100, 2), '%...', end='\r')


	df.to_csv('../model_development/final_dominant_topic_text_df_FULL.txt', index=False, sep='\t')
	print(len(df.index), 'out of', total_len, 'chats were saved for final visualization!')
Example #10
def main():
    parser = argparse.ArgumentParser(description='serializes given binary .pkl or .pkl.bz2 file to text-based .mm model file in MatrixMarket format (requires that collection data used for creation of the .pkl file is still available!)', epilog='Example: ./{} bowmodel.pkl.bz2 bowmodel.mm'.format(sys.argv[0]))
    parser.add_argument('model_pkl', type=argparse.FileType('r'), help='path to input .pkl or .pkl.bz2 model file (bag-of-words, tf-idf)')
    parser.add_argument('model_mm', type=argparse.FileType('w'), help='path to output .mm model file')
    args = parser.parse_args()
    input_model_pkl_path = args.model_pkl.name
    output_model_mm_path = args.model_mm.name
    
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')    
    logging.root.level = logging.INFO
    logger.info('serializing {} to {}'.format(input_model_pkl_path, output_model_mm_path))
    model = SaveLoad.load(input_model_pkl_path)
    MmCorpus.serialize(output_model_mm_path, model)
Example #11
 def loadTfidfModel(self, type='offline'):
     '''
     Load the TF-IDF model; if no cached model exists yet, build and save it.
     '''
     filePath = self.cachePath + '%s_tfidf_%s.model' % (self.name, type)
     if os.path.isfile(filePath):
         tfidfModel = SaveLoad.load(filePath)
     else:
         startTime = datetime.now()
         if type not in self.dictionary:
             self.loadDictionary(type)
         tfidfModel = TfidfModel(dictionary=self.dictionary[type])
         # tfidfModel = makeTfidfModel(self.dictionary)
         tfidfModel.save(filePath)
         print('train tfidfModel time:', datetime.now() - startTime)
     self.tfidfModel[type] = tfidfModel
     return tfidfModel
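
# Hedged usage sketch: loadTfidfModel() caches the trained model under cachePath,
# so a second call with the same `type` loads it from disk instead of retraining.
# `FeatureBuilder` and its constructor arguments are hypothetical stand-ins for the
# (not shown) class that defines this method.
#
# builder = FeatureBuilder(name='news', cachePath='./cache/')
# tfidf_offline = builder.loadTfidfModel(type='offline')   # trains and caches the model
# tfidf_cached = builder.loadTfidfModel(type='offline')    # loads the cached model
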
Example #12
def main():
    parser = argparse.ArgumentParser(
        description=
        'applies a trained lda model to a bag-of-words and saves the resulting corpus topics as a binary numpy dense matrix file (rows=documents, cols=topics)'
    )
    parser.add_argument('--bow',
                        type=argparse.FileType('r'),
                        help='path to input bow file (.mm/.mm.bz2)',
                        required=True)
    parser.add_argument('--model-prefix',
                        type=argparse.FileType('r'),
                        help='prefix of input binary lda model files',
                        required=True)
    parser.add_argument('--document-topics',
                        type=argparse.FileType('w'),
                        help='path to output dense matrix .npz file')

    args = parser.parse_args()
    input_bow_path = args.bow.name
    input_model_prefix = args.model_prefix.name
    output_document_topics_path = args.document_topics.name

    logger.info('loading bow corpus from {}'.format(input_bow_path))
    bow = MmCorpus(input_bow_path)
    logger.info('loading topic model from {}'.format(input_model_prefix))
    model = SaveLoad.load(input_model_prefix)

    logger.info(
        'generating dense document-topic-matrix: {} docs, {} topics'.format(
            bow.num_docs, model.num_topics))
    document_topics = corpus2dense(model[bow], model.num_topics,
                                   bow.num_docs).T
    logger.info('generated dense matrix of shape {}'.format(
        document_topics.shape))
    logger.debug('dense matrix\n{}'.format(document_topics))

    logger.info(
        'saving dense matrix to {}'.format(output_document_topics_path))
    save_npz(output_document_topics_path, document_topics)
Example #13
def main(args):

    logging.info('Initializing loaders with root %s, name %s' % (
        args.root, args.name))

    dloader = MultimodalShardedDatasetLoader(args.root, args.name)
    iloader = IndexLoader(args.root, args.name)

    logging.info('Loading pipeline with label %s' % args.label)

    pipeline_name = dloader.pipeline_name(args.label)
    pipeline = SaveLoad.load(pipeline_name)

    index_prefix = iloader.output_prefix(args.label)

    logging.info('Creating index with prefix %s' % index_prefix)

    dimension = safire.utils.transcorp.dimension(pipeline)
    index = similarities.Similarity(index_prefix, pipeline,
                                    num_features=dimension)

    iloader.save_index(index, args.label)
Example #14
def main(args):

    if args.root == 'test':
        args.root = safire.get_test_data_root()
        args.name = 'test-data'

    # Initializing loaders
    logging.info('Initializing loaders with root %s, name %s' % (
        args.root, args.name))

    mdloader = MultimodalShardedDatasetLoader(args.root, args.name)
    mloader = ModelLoader(args.root, args.name)

    # Loading datasets
    if args.mm_label and (args.img_label or args.text_label):
        raise ValueError('Cannot specify both mm_label and'
                         ' img_label/text_label.')

    if not args.img_label and not args.text_label and not args.mm_label:
        raise ValueError('Must specify text/image label or both or mm_label.')

    if args.img_label and args.text_label:
        logging.info('Will train a multimodal model: text label {0}, image '
                     'label {1}.'.format(args.img_label, args.text_label))

        logging.info('Assuming')
        #raise ValueError('Can only specify one of text and image label.')

    # Need to refactor dataset loading.
    # ...no more difference in principle between image labels and text labels.
    if args.img_label:
        logging.info('Loading image dataset with img. label {0}'
                     ''.format(args.img_label))
        pipeline_fname = mdloader.pipeline_name(args.img_label)

        #  - load the pipeline
        img_pipeline = SaveLoad.load(fname=pipeline_fname)
        # cast to Dataset
        img_pipeline = Dataset(img_pipeline)

    if args.text_label:
        logging.info('Loading text dataset with text label {0}'
                     ''.format(args.text_label))
        pipeline_fname = mdloader.pipeline_name(args.text_label)

        #  - load the pipeline
        text_pipeline = SaveLoad.load(fname=pipeline_fname)
        # - Cast to dataset
        text_pipeline = Dataset(text_pipeline, ensure_dense=True)

        # This is specifically a text transformation.
        if args.w2v:
            logging.info('Building and applying word2vec sampler. Note that '
                         'this will mean no serialization is performed after'
                         ' flattening, in case this is applied in a multimodal'
                         ' setting.')
            w2v_trans = Word2VecTransformer(args.w2v,
                                            get_id2word_obj(text_pipeline))
            w2v_sampler = Word2VecSamplingDatasetTransformer(w2v_trans)

            text_pipeline = w2v_sampler[text_pipeline]

    if (not args.text_label) and args.img_label:
        pipeline = img_pipeline
    elif args.text_label and (not args.img_label):
        pipeline = text_pipeline
    elif args.text_label and args.img_label:
        logging.info('Combining text and image sources into a multimodal '
                     'pipeline.')
        logging.info('Text pipeline:\n{0}'.format(log_corpus_stack(text_pipeline)))
        logging.info('Image pipeline:\n{0}'.format(log_corpus_stack(img_pipeline)))

        # - Combine into CompositeDatasest
        mm_composite_dataset = CompositeDataset((text_pipeline, img_pipeline),
                                                names=('txt', 'img'),
                                                aligned=False)
        # - Flatten the dataset
        #    - Load flatten indices
        t2i_file = os.path.join(mdloader.root,
                                mdloader.layout.textdoc2imdoc)
        # t2i_map = parse_textdoc2imdoc_map(t2i_file)
        # t2i_list = [[text, image]
        #             for text in t2i_map
        #             for image in t2i_map[text]]
        # Sorting the indices is an optimization for underlying ShardedCorpus
        # serializers.
        t2i_indexes = compute_docname_flatten_mapping(mm_composite_dataset,
                                                      t2i_file)

        #    - Initialize flattening transformer
        flatten = FlattenComposite(mm_composite_dataset, indexes=t2i_indexes)

        #    - Apply
        pipeline = flatten[mm_composite_dataset]

        if not args.w2v:
            #    - Serialize, because multimodal indexed retrieval is *slow*
            mm_serialization_label = args.text_label + '__' + args.img_label
            serialization_name = mdloader.pipeline_serialization_target(
                mm_serialization_label)
            logging.info('Serializing flattened multimodal data to {0}.'
                         ''.format(serialization_name))

            logging.debug('Pre-serialization pipeline: {0}'
                          ''.format(log_corpus_stack(pipeline)))
            serializer = Serializer(pipeline, ShardedCorpus, serialization_name,
                                    dim=dimension(pipeline),
                                    gensim_retrieval=False)
            pipeline = serializer[pipeline]

            mm_name = mdloader.pipeline_name(mm_serialization_label)
            pipeline.save(mm_name)
        else:
            logging.warn('Word2vec sampling active, cannot serialize flattened'
                         ' corpus.')

    if args.mm_label:
        logging.info('Loading multimodal pipeline with label {0}'
                     ''.format(args.mm_label))
        pipeline_name = mdloader.pipeline_name(args.mm_label)
        pipeline = SaveLoad.load(pipeline_name)

    logging.info('Loaded pipeline:\n{0}'.format(log_corpus_stack(pipeline)))

    #  - cast to dataset
    dataset = smart_cast_dataset(pipeline, test_p=0.1, devel_p=0.1,
                                 ensure_dense=True)

    logging.info('Setting up %s handle with output dimension %d' % (args.model,
                                                                    args.n_out))
    # Loading model class
    try:
        model_class = getattr(models, args.model)
    except AttributeError:
        raise ValueError('Invalid model specified: %s' % args.model)

    check_model_dataset_compatibility(dataset, model_class)

    # Setting up model initialization arguments
    activation = init_activation(args.activation)
    if not args.backward_activation:
        args.backward_activation = args.activation
    backward_activation = init_activation(args.backward_activation)

    model_init_args = {
        'heavy_debug': args.heavy_debug,
        'activation': activation,
        'backward_activation': backward_activation
    }
    if args.model == 'DenoisingAutoencoder':
        model_init_args['corruption_level'] = args.corruption
        model_init_args['reconstruction'] = args.reconstruction
        model_init_args['L1_norm'] = args.L1_norm
        model_init_args['L2_norm'] = args.L2_norm
        model_init_args['bias_decay'] = args.bias_decay
        model_init_args['sparsity_target'] = args.sparsity
        model_init_args['output_sparsity_target'] = args.output_sparsity

    if args.model == 'SparseDenoisingAutoencoder':
        model_init_args['corruption_level'] = args.corruption
        model_init_args['sparsity_target'] = args.sparsity
        model_init_args['reconstruction'] = args.reconstruction

    if args.model == 'RestrictedBoltzmannMachine' or args.model == 'ReplicatedSoftmax':
        model_init_args['sparsity_target'] = args.sparsity
        model_init_args['output_sparsity_target'] = args.output_sparsity
        model_init_args['CD_k'] = args.CD_k
        model_init_args['bias_decay'] = args.bias_decay
        model_init_args['CD_use_mean'] = not args.CD_use_sample
        model_init_args['prefer_extremes'] = args.prefer_extremes
        model_init_args['L1_norm'] = args.L1_norm
        model_init_args['L2_norm'] = args.L2_norm
        model_init_args['noisy_input'] = args.noisy_input

    logging.info('\nModel init args:' +
                 u'\n'.join([u'  {0}: {1}'.format(k, v)
                             for k, v in model_init_args.items()]))

    # Set up model
    model_handle = model_class.setup(dataset, n_out=args.n_out,
                                     **model_init_args)

    logging.info('Setting up learner...')

    lloader = LearnerLoader(args.root, args.name)

    learner = None
    if args.resume:
        try:
            learner = lloader.load_learner(args.transformation_label)
        except Exception:
            logging.warn('Could not load learner for resuming training, will'
                         ' start again. (Infix: %s)' % args.transformation_label)

    if not learner:
        learner = BaseSGDLearner(
            n_epochs=args.n_epochs,
            b_size=args.batch_size,
            validation_frequency=args.validation_frequency,
            track_weights=args.track_weights,
            track_weights_change=args.track_weights_change,
            plot_transformation=args.plot_transformation,
            plot_weights=args.plot_weights,
            plot_every=args.plot_every,
            plot_on_init=args.plot_on_init)

    # Intermediate model saving during training
    if args.save_every:

        learner_saving_overwrite = not args.no_overwrite_intermediate_saves
        learner.set_saving(infix=args.transformation_label,
                           model_loader=mloader,
                           save_every=args.save_every,
                           overwrite=learner_saving_overwrite)

    logging.info('Setting up and training transformer...')

    # Training starts here.
    transformer = SafireTransformer(model_handle, dataset, learner,
                                    attempt_resume=args.resume,
                                    profile_training=args.profile_training,
                                    dense_throughput=True)

    # Training is done at this point.

    if args.no_save:
        args.no_corpus_transform = True
        args.no_dataset_transform = True
        args.no_save_transformer = True
        args.no_save_learner = True

    if not args.no_save_learner:

        logging.info('Saving learner with label %s' % args.transformation_label)
        lloader.save_learner(learner, args.transformation_label)

    if args.plot_monitors:

        logging.info('Plotting monitors to %s' % args.plot_monitors)
        plt.figure()
        monitor = learner.monitor
        training_cost = monitor['training_cost']
        validation_cost = monitor['validation_cost']

        tc_x = map(operator.itemgetter(0), training_cost)
        tc_y = map(operator.itemgetter(1), training_cost)
        vc_x = map(operator.itemgetter(0), validation_cost)
        vc_y = map(operator.itemgetter(1), validation_cost)

        plt.plot(tc_x, tc_y, 'b')
        plt.plot(vc_x, vc_y, 'g')

        plt.savefig(args.plot_monitors)

    if not args.no_save_transformer:

        logging.info('Saving transformer with label %s' % args.transformation_label)
        mloader.save_transformer(transformer, args.transformation_label)

    logging.info('Creating transformed corpus with label {0}'
                 ''.format(args.transformation_label))
    # This applies the transformation to the input corpus.
    pipeline = transformer[pipeline]

    # Serialization (this should be wrapped in some utility function?)
    # Doesn't always have to happen. (Difference from dataset2corpus.)
    if args.serialize:
        serializer_class = ShardedCorpus
        data_name = mdloader.pipeline_serialization_target(args.transformation_label)
        serialization_start_time = time.clock()
        logging.info('Starting serialization: {0}'
                     ''.format(serialization_start_time))
        serializer_block = Serializer(pipeline, serializer_class,
                                      data_name,
                                      dim=dimension(pipeline))
        serialization_end_time = time.clock()
        logging.info('Serialization finished: {0}'
                     ''.format(serialization_end_time))

        pipeline = serializer_block[pipeline]

    # Now we save the pipeline. This is analogous to the Dataset2Corpus step.
    # In this way, also, the learned transformation is stored and can be
    # recycled, and other handles can be derived from the sftrans.model_handle.
    pipeline_savename = mdloader.pipeline_name(args.transformation_label)
    logging.info('    Pipeline name: {0}'.format(pipeline_savename))

    pipeline.save(pipeline_savename)
Example #15
    "SW_Dev_Web",
    "Business_Analyst/BI",
    "PM_Mkt_Vertrieb",
    "IT_Admin",
    "SW_Dev_Mobile/UI_Design",
    "Infra_Server_Admin",
    "DB_Dev_Admin",
    "IT_Consultant",
    "Infra_Network_Admin",
    "Data_Engr",
    "SW_Dev_Web",
    "SW_Dev_Web_Frontend",
    "IT_Consultant_Operations",
]

load_bigrams = SaveLoad.load("models/bigram_skills_title")


def text_processing(text):
    """Normalize, tokenize, stem the original text string

    Args:
    text: string. String containing message for processing

    Returns:
    cleaned: list of strings. List containing normalized and stemmed word tokens with bigrams
    """

    try:
        text = re.sub(r"(\d)", " ", text.lower())
        text = re.sub("[%s]" % re.escape(string.punctuation), " ", text)
Example #16
def load(name):
    try:
        return SaveLoad.load(name)
    except Exception as e:
        print(e)
Example #17
import pickle
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from gensim.utils import SaveLoad
from google.colab import drive
drive.mount('/content/gdrive')

#Loading google vectors and gensim model from drive

googlevecs = KeyedVectors.load_word2vec_format(
    '/content/gdrive/My Drive/GoogleNews-vectors-negative300.bin',
    binary=True)  # path where the Google News word vectors are saved
model = SaveLoad.load(
    '/content/gdrive/My Drive/Thesis/wordvectors'
)  # path where the model created with gensim is saved

#Loading test list

with open('test_list.txt', 'rb') as f:
    test_list = pickle.load(f)

#Creating test set inputs

test_inputs = list()
for sentence in test_list:
    v = np.zeros(300)
    for word in sentence:
        if word in model.wv:
            v += model.wv[word]
Example #18
def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    result_path = path.join(base_path, p['result_path'])
    lee_corpus = path.join(base_path, p['lee_corpus'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # remember starting time for runtime evaluation
    start = datetime.now()

    # load model and corpus
    logger.info('loading word mapping')
    dictionary = Dictionary.load(path.join(result_path,
                                           p['run'], p['dict_extension']))

    model_path = path.join(result_path, p['run'], p['lsi_ext'])
    logger.info('load model from: %s' % model_path)
    lsi = LsiModel.load(model_path)
    pre = SaveLoad.load(path.join(result_path, p['run'], p['pre_model_ext']))

    logger.info('load small Lee corpus and preprocess')
    with open(lee_corpus, 'r') as f:
        preproc_lee_texts = preprocessing.preprocess_documents(f.readlines())
    bow_lee_texts = [dictionary.doc2bow(text,
                                        allow_update=False,
                                        return_missing=False)
                    for text in preproc_lee_texts]

    logger.info('transforming small lee corpus (only pre model)')
    corpus_pre = pre[bow_lee_texts]

    # read the human similarity data and flatten upper triangular
    human_sim_matrix = np.loadtxt(path.join(base_path, p['human_data_file']))
    sim_m_size = np.shape(human_sim_matrix)[0]
    human_sim_vector = human_sim_matrix[np.triu_indices(sim_m_size, 1)]

    max_topics = lsi.num_topics

    logger.info("iterate from %d to %d dimensions (stepsize: %d)" %
                (p['min_dim'], max_topics, p['dim_step']))

    iter_range = range(p['min_dim'], max_topics, p['dim_step'])
    res = np.zeros(len(iter_range))
    for k, l in enumerate(iter_range):

        # do the lower dimensionality transformation
        lsi.num_topics = l
        corpus_lsi = lsi[corpus_pre]

        # compute pairwise similarity matrix of transformed corpus
        sim_matrix = np.zeros((len(corpus_lsi), len(corpus_lsi)))
        for i, par1 in enumerate(corpus_lsi):
            for j, par2 in enumerate(corpus_lsi):
                sim_matrix[i, j] = matutils.cossim(par1, par2)
        sim_vector = sim_matrix[np.triu_indices(len(corpus_lsi), 1)]

        # compute correlations
        cor = np.corrcoef(sim_vector, human_sim_vector)
        logger.info("step %d: correlation with lee data: %f" % (k, cor[0, 1]))
        res[k] = cor[0, 1]

    plt.figure()
    plt.plot(iter_range, res)
    plt.savefig(os.path.join(output_dir, 'cor_plot.' + p['plot_extension']))
    plt.close()
    np.save(path.join(output_dir, 'model_dim_res.npy'), res)

    dif = datetime.now() - start
    logger.info("finished after %d days and %d secs" % (dif.days, dif.seconds))
Example #19
def load_or_build_lda_model(conn):

	try:

		# load vocab dictionary
		vocab = SaveLoad.load('../model_development/saved_models/model_user_day_room_10.id2word')

		# load model
		lda_model = LdaModel.load('../model_development/saved_models/model_user_day_room_10')

		print('Pretrained lda model loaded!')

	except:

		# query for aggregating texts per user per room per day
		sql = """
		select group_concat(lemma, ' ') as lemma
		from lemmas
			join (
				select chat_id
					, from_userid
					, strftime('%Y-%m-%d', sent_at) as sent_day
					, room_id
				from chats
				) using (chat_id)
		where nullif(lemma, '') is not null
		group by from_userid
			, sent_day
			, room_id
		order by random();
		"""

		# get vocabulary
		MIN_OCCURRENCE = 100
		vocab = Dictionary([pd.read_sql('select word from words where freq >= {}'.format(MIN_OCCURRENCE), conn)['word'].tolist()])

		# models for different number of topics
		N_EPOCHS = 10
		n_topics = 10
		style = 'user_day_room'

		# init model
		lda_model = LdaModel(
			id2word=vocab,
			num_topics=n_topics, 
			alpha='auto',
			per_word_topics=True)

		# do training
		print('training model_{0}_{1}'.format(style, n_topics))
		for epoch in range(N_EPOCHS):
			print('\tepoch', epoch, '...', end='\r')
			for chunk in pd.read_sql(sql, conn, chunksize=10000):
				chunk_corpa = [vocab.doc2bow(text) for text in chunk['lemma'].str.split(' ').tolist()]
				lda_model.update(chunk_corpa)
			print('\tepoch', epoch, '... done!')

		# Save model to disk.
		lda_model.save("saved_models/model_{0}_{1}".format(style, n_topics))

	return vocab, lda_model
Example #20
def main(args):

    _starttime = time.clock()
    if args.root == 'test':
        args.root = safire.get_test_data_root()
        args.name = 'test-data'
    logging.info('Initializing dataset loader with root %s, name %s' % (args.root, args.name))
    loader = MultimodalShardedDatasetLoader(args.root, args.name)

    if args.clear:
        raise NotImplementedError('Cleaning not implemented properly through'
                                  ' a loader/layout object.')

    if args.flatten:
        if args.f_text is None or args.f_images is None:
            raise ValueError('Must provide --f_text and --f_images'
                             ' when attempting to flatten.')

        logging.info('Loading text pipeline and casting to dataset...')
        t_pipeline_name = loader.pipeline_name(args.f_text)
        t_pipeline = SaveLoad.load(t_pipeline_name)
        t_data = Dataset(t_pipeline)

        logging.info('Loading image pipeline and casting to dataset...')
        i_pipeline_name = loader.pipeline_name(args.f_images)
        i_pipeline = SaveLoad.load(i_pipeline_name)
        i_data = Dataset(i_pipeline)

        logging.info('Creating composite dataset...')
        mm_data = CompositeDataset((t_data, i_data), names=('text', 'img'),
                                   aligned=False)

        logging.info('Flattening dataset...')
        t2i_file = os.path.join(loader.root,
                                loader.layout.textdoc2imdoc)
        flatten_indexes = compute_docname_flatten_mapping(mm_data, t2i_file)
        flatten = FlattenComposite(mm_data, indexes=flatten_indexes)
        flat_mm_data = flatten[mm_data]

        if not args.label:
            logging.info('Generating flattened label automatically...')
            args.label = '__'.join([args.f_text, args.f_images])
            logging.info('    Generated label: {0}'.format(args.label))

        logging.info('Serializing flattened data...')
        serialization_name = loader.pipeline_serialization_target(args.label)
        serializer = Serializer(flat_mm_data, ShardedCorpus, serialization_name)
        pipeline = serializer[flat_mm_data]

        logging.info('Saving pipeline...')
        pipeline_name = loader.pipeline_name(args.label)
        pipeline.save(fname=pipeline_name)

        return

    if args.input_label is not None:
        logging.info('Loading corpus with label %s' % args.input_label)
        pipeline_fname = loader.pipeline_name(args.input_label)
        pipeline = SaveLoad.load(pipeline_fname)

        logging.info('Loaded corpus report:\n')
        logging.info(log_corpus_stack(pipeline))

    elif args.images:
        logging.info('Reading raw image data.')
        image_file = os.path.join(args.root, loader.layout.image_vectors)
        icorp = ImagenetCorpus(image_file, delimiter=';',
                               dim=4096, label='')
        pipeline = icorp

    else:
        logging.info('Reading raw text data.')
        vtargs = {}
        if args.label:
            logging.info('VTextCorpus will have label %s' % args.label)
            vtargs['label'] = args.label
        if args.pos:
            logging.info('Constructing POS filter with values {0}'
                         ''.format(list(args.pos)))
            vtargs['token_filter'] = PositionalTagTokenFilter(list(args.pos), 0)
        if args.pfilter:
            logging.info('Constructing positional filter: %d.' % args.pfilter)
            # If a fixed number of sentences is requested, use this.
            if args.pfilter % 1 == 0:
                args.pfilter = int(args.pfilter)
            vtargs['pfilter'] = args.pfilter
            if args.pfilter_fullfreq:
                vtargs['pfilter_full_freqs'] = args.pfilter_fullfreq
        if args.filter_capital:
            vtargs['filter_capital'] = True
        vtargs['tokens'] = args.tokens
        vtargs['sentences'] = args.sentences

        if args.tokens or args.sentences:
            # This already happens automatically inside VTextCorpus, but it
            # raises a warning we can avoid if we know about this in advance.
            vtargs['precompute_vtlist'] = False

        logging.info(u'Deriving corpus from loader with vtargs:\n{0}'.format(
            u'\n'.join(u'  {0}: {1}'.format(k, v)
                       for k, v in sorted(vtargs.items())))
        )

        vtcorp = loader.get_text_corpus(vtargs)
        # VTextCorpus initialization is still the same, refactor or not.
        logging.info('Corpus: %s' % str(vtcorp))
        logging.info('  vtlist: %s' % str(vtcorp.input))

        pipeline = vtcorp  # Holds the data

    if args.tfidf:

        tfidf = TfidfModel(pipeline)
        pipeline = tfidf[pipeline]

    if args.top_k is not None:
        if args.images:
            logging.warn('Running a frequency-based transformer on image data'
                         ' not a lot of sense makes, hmm?')

        logging.info('Running transformer with k=%i, discard_top=%i' % (
            args.top_k, args.discard_top))

        if args.profile_transformation:
            report, transformer = safire.utils.profile_run(_create_transformer,
                                                           pipeline,
                                                           args.top_k,
                                                           args.discard_top)
            # Profiling output
            print(report.getvalue())
        else:
            transformer = FrequencyBasedTransformer(pipeline,
                                                    args.top_k,
                                                    args.discard_top)

        pipeline = transformer[pipeline]

    if args.post_tfidf:
        post_tfidf = TfidfModel(pipeline)
        pipeline = post_tfidf[pipeline]

    if args.word2vec is not None:
        logging.info('Applying word2vec transformation with embeddings '
                     '{0}'.format(args.word2vec))
        w2v_dictionary = get_id2word_obj(pipeline)
        # Extracting dictionary from FrequencyBasedTransform supported
        # through utils.transcorp.KeymapDict
        pipeline = convert_to_gensim(pipeline)
        word2vec = Word2VecTransformer(args.word2vec,
                                       w2v_dictionary,
                                       op=args.word2vec_op)
        pipeline = word2vec[pipeline]

    if args.w2v_filter_empty:
        print('Applying word2vec empty doc filtering.')
        document_filter = DocumentFilterTransform(zero_length_filter)
        pipeline = document_filter[pipeline]

    if args.uniform_covariance:
        ucov = LeCunnVarianceScalingTransform(pipeline)
        pipeline = ucov[pipeline]

    if args.tanh:
        pipeline = convert_to_gensim(pipeline)
        tanh_transform = GeneralFunctionTransform(numpy.tanh,
                                                  multiplicative_coef=args.tanh)
        pipeline = tanh_transform[pipeline]

    if args.capped_normalize is not None:
        logging.info('Normalizing each data point to '
                     'max. value %f' % args.capped_normalize)
        cnorm_transform = CappedNormalizationTransform(pipeline,
                                                        args.capped_normalize)
        pipeline = cnorm_transform[pipeline]

    if args.normalize is not None:
        logging.info('Normalizing each data point to %f' % args.normalize)
        norm_transform = NormalizationTransform(args.normalize)
        pipeline = norm_transform[pipeline]

    logging.info('Serializing...')
    # Rewrite as applying a Serializer block.

    if isinstance(pipeline, VTextCorpus):
        logging.info('Checking that VTextCorpus dimension is available.')
        #if not pipeline.precompute_vtlist:
        #    logging.info('    ...to get dimension: precomputing vtlist.')
        #    pipeline._precompute_vtlist(pipeline.input)
        if pipeline.n_processed < len(pipeline.vtlist):
            logging.info('Have to dry_run() the pipeline\'s VTextCorpus,'
                         ' because we cannot derive its dimension.')
            if args.serialization_format == 'gensim':
                logging.info('...deferring dimension check to serialization,'
                             ' as the requested serialization format does not'
                             ' need dimension defined beforehand.')
            else:
                pipeline.dry_run()

    data_name = loader.pipeline_serialization_target(args.label)
    logging.info('  Data name: {0}'.format(data_name))

    serializer_class = ShardedCorpus

    # Here, the 'serializer_class' will not be called directly. Instead,
    # a Serializer block will be built & applied. (Profiling serialization
    # currently not supported.)
    serialization_start_time = time.clock()
    logging.info('Starting serialization: {0}'.format(serialization_start_time))
    sparse_serialization = False
    gensim_serialization = False
    if args.serialization_format == 'sparse':
        sparse_serialization = True
    elif args.serialization_format == 'gensim':
        gensim_serialization = True
    elif args.serialization_format != 'dense':
        logging.warn('Invalid serialization format specified ({0}), serializing'
                     ' as dense.'.format(args.serialization_format))
    serializer_block = Serializer(pipeline, serializer_class,
                                  data_name,
                                  dim=dimension(pipeline),
                                  gensim_serialization=gensim_serialization,
                                  sparse_serialization=sparse_serialization,
                                  overwrite=(not args.no_overwrite),
                                  shardsize=args.shardsize)
    serialization_end_time = time.clock()
    logging.info('Serialization finished: {0}'.format(serialization_end_time))

    logging.debug('After serialization: n_processed = {0}'
                  ''.format(safire.utils.transcorp.bottom_corpus(pipeline).n_processed))

    pipeline = serializer_block[pipeline]

    assert isinstance(pipeline, SwapoutCorpus), 'Serialization not applied' \
                                                ' correctly.'

    if args.index:
        iloader = IndexLoader(args.root, args.name)
        index_name = iloader.output_prefix(args.label)
        logging.info('Building index with name {0}'.format(index_name))
        similarity_transformer = SimilarityTransformer(pipeline, index_name)
        # Should the pipeline get transformed? Or do we only want
        # the transformer?
        # What is the use case here? We need the *transformer*, not the
        # transformed data (that would be just the self-similarity of our
        # dataset), so we need to get some new input. We can retrieve
        # the pipeline.obj and lock the transformer onto another pipeline.
        pipeline = similarity_transformer[pipeline]

    logging.info('Corpus stats: {0} documents, {1} features.'.format(
        len(pipeline),
        safire.utils.transcorp.dimension(pipeline)))

    if not args.no_save_corpus:
        obj_name = loader.pipeline_name(args.label)
        logging.info('Saving pipeline to {0}'.format(obj_name))
        pipeline.save(obj_name)

    # HACK: logging word2vec OOV
    if args.word2vec:
        # Report out-of-vocabulary statistics
        #oov_report = word2vec.report_oov()
        #logging.info(u'OOV report:\n%s' % oov_report)
        word2vec.log_oov()

    if args.word2vec_export:
        word2vec_to_export = word2vec.export_used()
        embeddings_dict = word2vec_to_export.embeddings
        with open(args.word2vec_export, 'wb') as w2v_export_handle:
            cPickle.dump(embeddings_dict, w2v_export_handle, protocol=-1)

    _endtime = time.clock()
    _totaltime = _endtime - _starttime
    logging.info('Total main() runtime: %d s' % int(_totaltime))
    return
# 		body = str(art.loc[i,'content'])+ " " +str(art.loc[i,'title'])
# 		cleaned = clean(body)
# 		cleaned = word_tokenize(cleaned)
# 		labeled_sentances.append(cleaned)
# 	except:
# 		print('Error Index:',index)

# dct = Dictionary(labeled_sentances)  # fit dictionary

# dct.save("doc.dic")

####################################################################################
####################################################################################
####################################################################################

dct = SaveLoad.load("doc.dic")


def single_doc(doc_tokenized):
    tags = pos_tag(doc_tokenized)

    pos_score, neg_score = 0, 0
    for t in tags:
        word = t[0]
        pos = t[1][0]

        if pos == 'J':
            part = 'a'
        elif pos == 'N':
            part = 'n'
        elif pos == 'R':
Example #22
        x[0]
        for x in get_womanly_words(model, adjectives_list, woman_words_list)
    ]

    print("\n{}:".format(source_mame))
    print("\nManly adjectives: \n\t{}".format(", ".join(manly_words[:8])))
    print("\nWomanly adjectives: \n\t{}".format(", ".join(womanly_words[:8])))

    return manly_words, womanly_words


if __name__ == '__main__':
    books_model_path = "../data/w2v_models/book_v1_model"
    tv_show_model_path = "../data/w2v_models/tv_show_v1_model"

    books_model = SaveLoad.load(books_model_path)
    tv_show_model = SaveLoad.load(tv_show_model_path)

    test_words_similarity(books_model, tv_show_model)

    # dim_reduction(books_model, tv_show_model)

    books_manly_words, books_womanly_words = get_similart_words_embd(
        books_model, source_mame='Books')
    tv_manly_words, tv_womanly_words = get_similart_words_embd(
        tv_show_model, source_mame='TV Show')

    similarity_df = pd.DataFrame([
        books_manly_words, books_womanly_words, tv_manly_words,
        tv_womanly_words
    ]).T
Example #23
def main(args):
    logging.info('Executing dataset_stats.py...')

    loader = MultimodalShardedDatasetLoader(args.root, args.name)

    # Loading and/or computing
    if not args.load:
        dataset_name = loader.pipeline_name(args.dataset)
        dataset = SaveLoad.load(dataset_name)
        dataset = convert_to_dense(dataset)
        # if args.text:
        #     wrapper_dataset_name = loader.pipeline_name(args.dataset)
        #     wrapper_dataset = SaveLoad.load(wrapper_dataset_name)
        #     dataset = wrapper_dataset.data
        #     vtcorp = wrapper_dataset.vtcorp
        #     print 'Dimension of underlying text data: %d' % dimension(vtcorp)
        #     print 'Dimension of dataset: %d' % dataset.dim
        #     # The ShardedDataset, not the text-modality wrapper
        # else:
        #     dataset = loader.load_img(args.dataset).data

        logging.info('Loaded dataset: %d items, dimension %d' % (len(dataset), dimension(dataset)))
        report, stats = safire.utils.profile_run(do_stats_init, args, dataset)
    else:
        with open(args.load) as input_handle:
            stats = cPickle.load(input_handle)

    stats.report()
    #stats.raw_cell_histogram()
    #stats.cell_mass_histogram(n=100)
    #stats.cell_cumulative_mass(n=100)

    max_feature = list(stats.feature_totals).index(max(stats.feature_totals))
    #stats.feature_histogram(0, stats.n_items_processed, whole_dataset=True)
    #stats.feature_histogram(max_feature,
    #                       stats.n_items_processed, whole_dataset=True)

    #stats.item_histogram(0)
    #stats.item_histogram(3)
    #stats.item_histogram(117)
    #stats.feature_lecunn_covariances()

    #stats.nnz_histogram()


    inspected_features = sorted(numpy.random.choice(stats.dataset_dim,
                                                    size=100,
                                                    replace=False))
    #inspection_matrix = stats.pairwise_feature_matrix(inspected_features,
    #                                       safire.utils.matutils.maxn_sparse_rmse)
    #safire.utils.heatmap_matrix(inspection_matrix, 'MaxNormalized dataset RMSE')
    #pairwise_avg = numpy.average(inspection_matrix)

    logging.info('Sampling raw matrix...')

    n_raw_samples = min(len(dataset), 1000)
    raw_matrix = numpy.array([stats.dataset[idx]
                              for idx in stats.d_idxs[:n_raw_samples]])

    if args.activation_histogram:
        logging.info('Computing histogram of activations...')
        stats.rich_histogram(raw_matrix.ravel(), n_bins=100,
                             with_zero=False,
                             title='Feature activation histogram')

    if args.average_activations:
        logging.info('Computing average activations...')
        feature_totals = numpy.array(stats.feature_totals)
        avg_feature_totals = feature_totals / numpy.sum(feature_totals)
        plt.plot(sorted(avg_feature_totals))
        plt.hist(avg_feature_totals, bins=20, color='red', histtype='step',
                 orientation='horizontal')
        plt.title('Sorted feature means')
        plt.show()

    if args.normalize_covariance:
        logging.info('Normalizing covariance...')
        covariances = numpy.sum(raw_matrix ** 2, axis=0) / raw_matrix.shape[0]
        #print covariances[:10]
        scaled_raw_matrix = scale_to_unit_covariance(raw_matrix)
        scaled_covariances = numpy.sum(scaled_raw_matrix ** 2, axis=0) / scaled_raw_matrix.shape[0]

        plt.figure()
        plt.plot(covariances, color='b')
        plt.plot(scaled_covariances, color='r')
        plt.show()

        #stats.feature_histogram(max_feature, n_bins=100, whole_dataset=True)
        #stats.rich_histogram(raw_matrix[:,max_feature], n_bins=100)
        #stats.rich_histogram(scaled_raw_matrix[:,max_feature], n_bins=100)

        #stats.rich_histogram(raw_matrix[:,0], n_bins=100)
        #stats.rich_histogram(scaled_raw_matrix[:,0], n_bins=100)

        safire.utils.heatmap_matrix(numpy.absolute(scaled_raw_matrix),
                                    title='UCov. dataset heatmap',
                                    with_average=True,
                                    colormap='afmhot',
                                    vmin=0.0, vmax=stats.maximum)


    if args.raw:
        safire.utils.heatmap_matrix(numpy.absolute(raw_matrix),
                                    title='Dataset heatmap',
                                    with_average=True,
                                    colormap='afmhot',
                                    vmin=0.0, vmax=stats.maximum)

        stats.rich_histogram(raw_matrix.ravel())

    if args.correlation:
        logging.info('Computing correlation...')
        if args.normalize_covariance:
            corrcoef_matrix = numpy.corrcoef(scaled_raw_matrix, rowvar=0)
        else:
            corrcoef_matrix = numpy.corrcoef(raw_matrix, rowvar=0)

        print('Average correlation: %f' % numpy.average(corrcoef_matrix))

        plt.figure(facecolor='white', figsize=(8,6))
        plt.pcolormesh(numpy.absolute(corrcoef_matrix),
                       #title='Pearson C.Coef. heatmap',
                       cmap='afmhot',
                       vmin=0.0, vmax=1.0)
        plt.colorbar()
        plt.xlim([0,corrcoef_matrix.shape[1]])
        plt.ylim([0,corrcoef_matrix.shape[0]])
        plt.show()

    if args.tanh:
        logging.info('Plotting tanh transformation...')
        tanh_matrix = numpy.tanh(raw_matrix / args.tanh)
        stats.rich_histogram(tanh_matrix, n_bins=20, title='Tanh matrix histogram.')

    if args.hyper:
        logging.info('Plotting hyperbolic transformation...')
        hyp_matrix = raw_matrix / (raw_matrix + args.hyper)
        stats.rich_histogram(hyp_matrix, n_bins=100, title='x/(1+x) matrix histogram.')

    if args.sparsity:
        logging.info('Computing sparsity...')
        # One entry per feature, counts how many non-zero elements are there
        # in each column of the raw matrix.
        num_nnz  = numpy.array([ len([i for i in raw_matrix[:,f] if i != 0])
                                  for f in range(raw_matrix.shape[1]) ],
                               dtype=numpy.float32)
        p_nnz = num_nnz / float(raw_matrix.shape[0])
        plt.plot(sorted(p_nnz))
        plt.hist(p_nnz, bins=20, histtype='step', color='r',
                 orientation='horizontal')


    #print 'Pairwise average for %d random features: %f' % (len(inspected_features),
    #                                                       pairwise_avg)

    if args.save:
        with open(args.save, 'w') as output_handle:
            cPickle.dump(stats, output_handle, protocol=-1)

    logging.info('Exiting dataset_stats.py.')
import pickle
from gensim import corpora, models, matutils
from gensim.utils import SaveLoad
import numpy as np
from bson.objectid import ObjectId
import pymongo
import json

client = pymongo.MongoClient()
db = client.meetup
#groups_col=db.groups
groups_clean_extra_col = db.groups_clean_extra

_id_df = pickle.load(open( './controllers/lsi_id_df.pkl', "rb" ))
count_vectorizer = pickle.load(open( './controllers/count_vectorizer.pkl', "rb" ))
tfidf = models.tfidfmodel.TfidfModel.load('./controllers/tfidf.pkl')
lsi = models.LsiModel.load('./controllers/lsi')
index = SaveLoad.load('./controllers/index.pkl')

def recommend_texts(text,_id_df,count_vectorizer,tfidf,lsi,index):
	# _id_df = pickle.load(open( 'lsi_id_df.pkl', "rb" ))
	# count_vectorizer = pickle.load(open( 'count_vectorizer.pkl', "rb" ))
	# tfidf = models.tfidfmodel.TfidfModel.load('tfidf.pkl')
	# lsi = models.LsiModel.load('lsi')
	# #corpus_lsi = models.LsiModel.load('lsi_corpus')
	# index = SaveLoad.load('index.pkl')
	metis_vecs = count_vectorizer.transform(np.array(text)).transpose()
	metis_corpus= matutils.Sparse2Corpus(metis_vecs)
	metis_tfidf_corpus = tfidf[metis_corpus]
	metis_lsi_corpus = lsi[metis_tfidf_corpus]
	metis_doc_vecs = [doc for doc in metis_lsi_corpus]
	index.num_best=500
	my_index=index[metis_doc_vecs]
def main():
    parser = argparse.ArgumentParser(
        description=
        'prints the topics of an LDA instance, sorted in decreasing order of their average probabilities in the collection; prints some stats on the appearances of the most important terms of the most important topics'
    )
    parser.add_argument('--bow',
                        type=argparse.FileType('r'),
                        help='path to input bow file (.mm/.mm.bz2)',
                        required=True)
    parser.add_argument('--model-prefix',
                        type=argparse.FileType('r'),
                        help='prefix of input binary lda model files',
                        required=True)

    args = parser.parse_args()
    input_bow_path = args.bow.name
    input_model_prefix = args.model_prefix.name

    logger.info('running with:\n{}'.format(
        pformat({
            'input_bow_path': input_bow_path,
            'input_model_prefix': input_model_prefix
        })))

    logger.info('loading bow corpus from {}'.format(input_bow_path))
    bow = MmCorpus(input_bow_path)
    logger.info('loading topic model from {}'.format(input_model_prefix))
    model = SaveLoad.load(input_model_prefix)
    logger.info('loaded {} docs, {} topics'.format(bow.num_docs,
                                                   model.num_topics))

    logger.info('generating sparse document-topic-matrix')
    document_topic_probs = dok_matrix((bow.num_docs, model.num_topics),
                                      dtype='d')
    for docid, topics in enumerate(model[bow]):
        for topicid, prob in topics:
            document_topic_probs[docid, topicid] = prob
    document_topic_probs = document_topic_probs.tocsr()

    logger.info(
        'calculating average topic probabilities of the document collection')
    topics_avg = document_topic_probs.sum(axis=0) / bow.num_docs
    topics_max = document_topic_probs.max(axis=0).todense().tolist()[0]
    logger.debug('avg {} shape {}'.format(topics_avg, topics_avg.shape))
    logger.debug('max {} len {}'.format(topics_max, len(topics_max)))
    topics_avg = list(enumerate(topics_avg.tolist()[0]))
    topics_avg.sort(key=lambda t: t[1], reverse=True)

    num_printed_terms = 10
    logger.info('topics with highest average probabilities')
    for topicid, topic_avg_prob in topics_avg:
        logger.info(
            'topic ID={0}, avgprob={1:.4f}, maxprob={2:.4f}, terms:\n{3}'.
            format(topicid, topic_avg_prob, topics_max[topicid],
                   model.print_topic(topicid, topn=num_printed_terms)))

    num_top_topics = min(5, len(topics_avg))
    num_top_terms = 20
    top_topics = [topicid
                  for topicid, topic_avg in topics_avg][:num_top_topics]
    logger.info(
        'calculating stats of top-{}-topics {} with top-{}-terms per topic'.
        format(num_top_topics, top_topics, num_top_terms))
    term_topics = defaultdict(
        list
    )  # mapping termid -> topicids, for every termid that appears in the top-k terms of any topic
    for topicid in top_topics:
        for termid, prob in get_topic_terms(model, topicid,
                                            topn=num_top_terms):
            term_topics[termid].append(topicid)
    term_topics = dict(term_topics)

    num_different_docs_per_topic = {topicid: 0 for topicid in top_topics}
    sum_bow_values_per_topic = {topicid: 0 for topicid in top_topics}
    for docid, document_term_bow in enumerate(bow):
        doc_topics = set()
        for termid, bow_value in document_term_bow:
            if termid in term_topics:
                for topicid in term_topics[termid]:
                    doc_topics.add(topicid)
                    sum_bow_values_per_topic[topicid] += bow_value
        for topicid in doc_topics:
            num_different_docs_per_topic[topicid] += 1

    for topicid in top_topics:
        logger.info(
            'top-{}-terms of topic {} occur {} times in the collection'.format(
                num_top_terms, topicid,
                int(sum_bow_values_per_topic[topicid])))
        logger.info(
            'top-{}-terms of topic {} occur in {} different documents'.format(
                num_top_terms, topicid, num_different_docs_per_topic[topicid]))
Example #26
                    # words = list(gensim.utils.simple_tokenize(line))
                    if words:
                        # print(words)
                        # yield words
                        f1.write(" ".join(words))
                        f1.write("\n")
                        # yield list(gensim.models.word2vec.LineSentence(name))


def make_bi_tri(paths, tri=False):
    sentences = PathLineSentences(paths)
    phases = Phrases(sentences)
    bigram = Phraser(phases)
    bigram.save("bigram")
    if tri:
        triphases = Phrases(bigram[sentences])
        trigram = Phraser(triphases)
        trigram.save("trigram")


if __name__ == "__main__":
    tar = tarfile.open("../saos-dump-23.02.2018.tar.gz", "r:gz")
    give_me(tar)
    tar.close()
    from_file("data", lower=True)
    sentences = PathLineSentences(os.path.join(os.getcwd(), "preprocesssed"))
    bigram = SaveLoad.load("bigram")
    trigram = SaveLoad.load("trigram")
    word = Word2Vec(trigram[bigram[sentences]], window=5, sg=0, size=300, min_count=3, workers=7)
    word.save("counted_model")
Example #27
def load_phraser(modelpath: PathType, n: int) -> PhraserType:
    modelpath = os.path.join(modelpath, f"{n}gramsphraser")
    phraser = gensimSaveLoad.load(modelpath)
    return phraser