def generate_model(dictionary, bow_corpus, corpus_path):
    """Return a TF-IDF model, loading it from disk or building it on a miss.

    The model file is looked up at ``corpus_path + 'wiki-tfidf.model'``. If it
    cannot be loaded, a new model is built from ``bow_corpus`` and
    ``dictionary`` and saved to that same path before being returned.

    Args:
        dictionary: Passed as the second positional argument of
            ``TfidfModel`` when a new model is built.
        bow_corpus: Passed as the first positional argument of ``TfidfModel``
            when a new model is built.
        corpus_path: Path prefix. ``'wiki-tfidf.model'`` is appended to it
            verbatim (no path separator is inserted).

    Returns:
        The loaded or newly built ``TfidfModel`` instance.
    """
    model_path = corpus_path + 'wiki-tfidf.model'
    try:
        tfidf = TfidfModel.load(model_path)
    except Exception:
        # Any failure to load is treated as a cache miss and the model is
        # rebuilt. Catching Exception rather than using a bare ``except``
        # lets KeyboardInterrupt / SystemExit propagate instead of silently
        # starting a rebuild.
        # NOTE(review): gensim documents the second positional parameter of
        # TfidfModel as ``id2word``, not ``dictionary`` -- confirm that this
        # is the intended argument.
        tfidf = TfidfModel(bow_corpus, dictionary)
        # Public counterpart of ``_smart_save``; it delegates to
        # ``_smart_save`` when given a filename.
        tfidf.save(model_path)
    else:
        print('tfidf model generated')
    return tfidf
config.read('config.ini')

# TODO: Generalize this step by making corpus_path point at whichever corpus
# is actually in use. The config file must allow Wikipedia, Gutenberg, ...
# The [1:-1] slice drops the first and last character of the configured value
# (presumably surrounding quotes -- confirm against config.ini).
corpus_path = config['WIKI']['en'][1:-1]

dictionary = Dictionary.load_from_text(
    os.path.relpath(corpus_path + '_wordids.txt.bz2'))
bow_corpus = MmCorpus(os.path.relpath(corpus_path + '_bow.mm'))

# Load the TF-IDF model stored next to the corpus, or build and save it when
# it cannot be loaded. This reuses generate_model() instead of repeating its
# load-or-build logic inline; generate_model() also prints a message when the
# stored model is loaded successfully.
tfidf = generate_model(dictionary, bow_corpus, corpus_path)

# Testing sentences
sentence1 = 'pilar pescado en la tarde es fatal'
sentence2 = 'machacar pescado al atardecer es terrible'

# Transforming sentences: split on whitespace, convert the tokens to
# bag-of-words with the dictionary, then apply the TF-IDF model.
sent1 = sentence1.split()
sent2 = sentence2.split()
bow_sent1 = dictionary.doc2bow(sent1)
bow_sent2 = dictionary.doc2bow(sent2)
bow_sent1_tfidf = tfidf[bow_sent1]
bow_sent2_tfidf = tfidf[bow_sent2]