Example #1
 def __init__(self):
     self.dictionary = Dictionary.load(app.config["RCMDR_DICT"])
     self.corpus = corpora.MmCorpus(app.config["RCMDR_CORPUS"])
     self.tfidf = TfidfModel.load(app.config["RCMDR_TFIDF_MODEL"])
     self.lda_model = LdaModel.load(app.config["RCMDR_LDA_MODEL"])
     self.lsi_model = LsiModel.load(app.config["RCMDR_LSI_MODEL"])
     self.lda_index = Similarity.load(app.config["RCMDR_LDA_INDEX"])
     self.lsi_index = Similarity.load(app.config["RCMDR_LSI_INDEX"])
     # Map integer job IDs to labels from a "key=value" per-line file.
     self.job_labels = {}
     with open(app.config["RCMDR_JOB_LABELS"]) as f:
         for line in f:
             key, value = line.strip().split("=")
             self.job_labels[int(key)] = value
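A minimal sketch of how these loaded objects might serve a query; the method name, the tokens argument, and the query flow are assumptions, not part of the original class:

 def recommend(self, tokens):
     # Hypothetical helper: vectorize the tokens and query both indexes.
     query_bow = self.dictionary.doc2bow(tokens)
     lda_sims = self.lda_index[self.lda_model[query_bow]]
     lsi_sims = self.lsi_index[self.lsi_model[self.tfidf[query_bow]]]
     return lda_sims, lsi_sims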
Example #2
    def __init__(self, model_prefix=None, num_best=None):
        self.model_prefix = model_prefix
        self.num_best = num_best
        if self.model_prefix is None:
            raise ValueError("model_prefix must be specified")

        logger.info("ESA: Loading word dictionary...")
        self.dictionary = Dictionary.load_from_text(model_prefix +
                                                    '_wordids.txt.bz2')

        logger.info("ESA: Loading document name map...")
        self.article_dict = utils.unpickle(model_prefix +
                                           '_bow.mm.metadata.cpickle')

        logger.info("ESA: Loading TF-IDF model...")
        self.tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

        logger.info("ESA: Loading similarity index...")
        self.similarity_index = Similarity.load(model_prefix +
                                                '_similarity.index',
                                                mmap='r')

        #logger.info("ESA: Preloading reverse indexes...")
        #self.similarity_index.preload_reverse_index()

        logger.info("ESA: Finished loading model files.")
Example #3
 def create_similarity_index(self):
     if not os.path.isfile(self.similarity_file):
         self.similarity_index = Similarity('./LSM/', self.corpora,
                                            self.num_topics)
         self.similarity_index.save(self.similarity_file)
     else:
         self.similarity_index = Similarity.load(self.similarity_file)
Example #4
def initiate_recommender():
    # Retrieve all the necessary files for the recommender system
    baseDir = settings.BASE_DIR

    # Load dictionary and corpus
    dictFile = baseDir + "/static/data/DBLP_Dictionary.dict"
    corpusFile = baseDir + "/static/data/DBLP_Corpus.mm"

    dictionary = corpora.Dictionary.load(dictFile)
    corpus = corpora.MmCorpus(corpusFile)

    # Load the TF-IDF model
    tfidfFile = baseDir + "/static/data/TF-IDF"

    tfidf = models.TfidfModel.load(tfidfFile)

    # Load the Gensim similarity index
    indexFile = baseDir + "/static/data/Index"
    sims = Similarity.load(indexFile)

    # If matrix fits in memory, use this instead and comment out previous two lines
    #sims = MatrixSimilarity(tfidf[corpus], num_features=(len(dictionary)))

    # Point to the text csv file
    textFile = baseDir + "/static/data/Text.csv"

    # Load ID dataframe from recommender
    paperIDs = baseDir + "/static/data/AbsID.csv"
    cols = ["paperID"]
    dfIDs = pd.read_csv(paperIDs, names=cols, header=None)

    return dictionary, corpus, tfidf, sims, textFile, dfIDs
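A hedged sketch of how the returned objects might be consumed downstream; abstract_tokens and the top-10 cutoff are assumptions:

# Vectorize a query abstract and rank every paper by TF-IDF cosine similarity.
dictionary, corpus, tfidf, sims, textFile, dfIDs = initiate_recommender()
query_bow = dictionary.doc2bow(abstract_tokens)
scores = sims[tfidf[query_bow]]
top_papers = dfIDs.iloc[scores.argsort()[::-1][:10]]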
Example #5
	def run(self):
		if self.clean_level in ('raw', 'clean', 'stopwords'):
			kind = self.clean_level
		else:
			kind = 'stopwords'

		# Save the similarities to a file in a simple format
		# NOTE: THE INDEX ALREADY STORES THE SIMILARITIES; THERE IS NO NEED TO COMPUTE THEM AGAIN
		for idioma, salida in self.output()['langs'].items():
			file_list = os.listdir(os.path.join(self.txt_dir, kind, idioma))
			for n_topics, o in salida.items():
				index = Similarity.load(self.input()['langs'][idioma][n_topics]['lsi-index'].path)

				# JSON
				sims = index2dict(index, file_list, num_sims=self.num_similar_docs)
				with o['json'].open('w') as f:
					json.dump(sims, f)

				# HTML + CSV
				s = u''
				net = pd.DataFrame(columns=['from_name', 'to_name', 'sim'])
				for book, v in sims.items():
					s += u'-------------------------------------------\n'
					s += u'### %s\n\n' % (book)
					s += u'| Rank | Book | Similarity |\n|:--------:|:-------|-------------:|\n'
					for rank, attrs in v.items():
						s += u'| %d | %s | %f |\n' % (rank, attrs['name'], round(attrs['similarity'], 3))
						net = pd.concat([net, pd.DataFrame({'from_name': [book], 'to_name': [attrs['name']], 'sim': [attrs['similarity']]})])
					s += u'\n\n'
				md = markdown.markdown(s, extensions=['markdown.extensions.tables'])
				books = sorted(set(net['from_name']).union(net['to_name']))
				ids = {v: i for i, v in enumerate(books)}
				net['from'] = [ids[k] for k in net['from_name']]
				net['to'] = [ids[k] for k in net['to_name']]

				with o['html'].open('w') as f:
					f.write(md)
				with o['csv'].open('w') as f:
					net.to_csv(f, index=False)

				# Network (rendered in R)
				tempname = 'net_temp0.html'
				i = 1
				while os.path.exists(tempname):
					tempname = 'net_temp%d.html' % i
					i += 1
					if i >= 100:
						print('ERROR: Cannot create the temporary network file. Check that no file named %s exists in this folder and that you have write permission.' % tempname)
						break
				subprocess.call(['itam-d3-network.R', '--input', o['csv'].path, '--output', tempname, '--max_links', str(self.num_similar_docs), '--min_sim', str(self.min_similarity)])
				print('USER INFO: Creating temporary file: ' + tempname)
				shutil.move(tempname, o['net'].path)
				print('USER INFO: Move complete, %s --> %s' % (tempname, o['net'].path))

				if os.path.exists(tempname):
					os.remove(tempname)
Example #6
    def load(self, path):
        if isinstance(path, str):
            path = Path(path)

        with open(path / 'paragraph-ids.txt') as f:
            self.paragraph_ids = [paragraph_id.strip() for paragraph_id in f]

        dictionary_path = str(path / 'dct.pkl')
        self.dictionary = Dictionary.load(dictionary_path)

        index_path = str(path / 'indexes' / 'master-index')
        self.index = Similarity.load(index_path)
        self.index.num_best = self.num_best
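Because num_best is set on the loaded index, each query returns only the top-n (position, score) pairs instead of a full similarity array. A usage sketch, assuming a tokens list:

        query_bow = self.dictionary.doc2bow(tokens)
        for doc_pos, score in self.index[query_bow]:
            print(self.paragraph_ids[doc_pos], score)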
Example #7
	def run(self):
		if self.clean_level in ('raw', 'clean', 'stopwords'):
			kind = self.clean_level
		else:
			kind = 'stopwords'

		# Save the similarities to a file in a simple format
		# NOTE: THE INDEX ALREADY STORES THE SIMILARITIES; THERE IS NO NEED TO COMPUTE THEM AGAIN
		for idioma, salida in self.output()['langs'].items():
			file_list = os.listdir(os.path.join(self.txt_dir, kind, idioma))
			for n_topics, o in salida.items():
				index = Similarity.load(self.input()['langs'][idioma][n_topics]['lsi-index'].path)
				sims = arrange_similarities(index, file_list, num_sims=self.num_similar_docs)
				sims = '\n'.join(['\t'.join([str(i) for i in t]) for t in sims])
				with o.open('w') as f:
					f.write(sims)
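The arrange_similarities helper is not shown; a plausible sketch consistent with the tab-separated lines written above (the exact tuple layout is an assumption):

def arrange_similarities(index, file_list, num_sims=10):
	# For each indexed document, keep its closest neighbours as
	# (source name, neighbour name, similarity) tuples.
	index.num_best = num_sims + 1  # +1: a document always matches itself
	out = []
	for pos, name in enumerate(file_list):
		for other, score in index.similarity_by_id(pos):
			if other != pos:
				out.append((name, file_list[other], score))
	return out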
Example #8
def load_precomputed():
  global questions
  global documents
  global dct
  global corpus
  global tfidf_model
  global corpus_tfidf
  global index

  if questions is None:
    questions = pickle.load(open('precompute/questions.pkl', 'rb'))
    logger.info("Loaded questions")

  if documents is None:
    documents = pickle.load(open('precompute/documents.pkl', 'rb'))
    logger.info("Loaded tokenized questions")

  if dct is None:
    dct = pickle.load(open('precompute/dct.pkl', 'rb'))
    logger.info("Loaded dictionary")

  if corpus is None:
    corpus = pickle.load(open('precompute/corpus.pkl', 'rb'))
    logger.info("Loaded corpus")

  if tfidf_model is None:
    tfidf_model = pickle.load(open('precompute/tfidf_model.pkl', 'rb'))
    logger.info("Loaded tfidf model")

  if corpus_tfidf is None:
    corpus_tfidf = pickle.load(open('precompute/corpus_tfidf.pkl', 'rb'))
    logger.info("Loaded tfidf corpus")

  if index is None:
    index = Similarity.load("precompute/similarities.pkl")
    logger.info("Loaded similarities")

  logger.info("Loaded precomputed stuff")
Example #9
def largevisformat(c_file, s_file, o_file):
    from gensim import corpora
    from gensim.similarities import Similarity

    corpus = corpora.MmCorpus(c_file)

    with open(o_file, 'w') as ofile:
        if s_file:
            # Pairwise mode: write one "i j similarity" line per document pair,
            # reading the similarities straight from the saved index.
            sim_index = Similarity.load(s_file)
            for i, doc in enumerate(corpus):
                for j, sim in enumerate(sim_index[doc]):
                    ofile.write("%d %d %f\n" % (i, j, sim))
        else:
            # Dense mode: a "num_docs num_terms" header, then one row of
            # space-separated term weights per document.
            ofile.write("%d %d\n" % (corpus.num_docs, corpus.num_terms))
            for doc in corpus:
                doc.sort(key=lambda x: x[0])
                ps = 0
                for term_id, weight in doc:
                    # Zero-fill the gap between the previous term and this one.
                    for _ in range(term_id - ps):
                        ofile.write('0.0 ')
                    ofile.write('%f ' % weight)
                    ps = term_id + 1
                # Zero-fill the rest of the row and terminate the document's line.
                for _ in range(corpus.num_terms - ps):
                    ofile.write('0.0 ')
                ofile.write('\n')
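A hedged invocation sketch; the file names are placeholders:

# Pairwise mode: write (i, j, similarity) triples from a saved index.
largevisformat('bow_corpus.mm', 'lsi.index', 'largevis_edges.txt')

# Dense mode: write the corpus itself as a num_docs x num_terms matrix.
largevisformat('bow_corpus.mm', None, 'largevis_input.txt')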
Example #10
 def __init__(self, model_filename, index_filename):
     # lemmatizer and model for keyword inference
     self.lemmatize = nltk.stem.WordNetLemmatizer().lemmatize
     self.model = Doc2Vec.load(model_filename)
     # index for similarity queries
     self.index = Similarity.load(index_filename)
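A query method in the same spirit would lemmatize the text, infer a vector, and hit the index; the method name and top_n are assumptions:

 def similar(self, text, top_n=10):
     # Hypothetical sketch: infer a Doc2Vec vector for the text and
     # return the top_n closest documents from the loaded index.
     tokens = [self.lemmatize(w) for w in text.lower().split()]
     self.index.num_best = top_n
     return self.index[self.model.infer_vector(tokens)]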
Example #11
import logging

import pandas as pd
from gensim import corpora
from gensim.models import LdaModel, LdaMulticore
from gensim.similarities import Similarity

logger = logging.getLogger(__name__)

# Create references

# Load LDA model
lda_model_tfidf = LdaModel.load('./lda_data/lda_model_tfidf.model')

# Load the BoW model
bow_corpus = corpora.MmCorpus('./lda_data/bow_corpus.mm')

# load the index
index = Similarity.load('./lda_data/wine.index')

# Create indices, a vector of wine names to position in data
wine_data = pd.read_csv('./lda_data/df_out_data.csv', index_col=0)
wine_best = wine_data.loc[wine_data['points'] > 90,
                          ['winery', 'variety', 'designation_replace']].sample(
                              5)

indices = pd.Series(wine_data.index,
                    index=wine_data.designation_replace).drop_duplicates()

indices.index.names = ['name']

# Function that takes a wine name as input and returns the most similar wines
# (adapted from DataCamp)
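The snippet cuts off before that function; a minimal sketch of what the lookup could look like, assuming the saved index was built over lda_model_tfidf vectors (the function name and return columns are assumptions):

def similar_wines(name, top_n=5):
    # Map the wine name to its row, project its BoW vector through the LDA
    # model, and rank every wine by similarity, skipping the wine itself.
    pos = indices[name]
    sims = index[lda_model_tfidf[bow_corpus[pos]]]
    best = sims.argsort()[::-1][1:top_n + 1]
    return wine_data.iloc[best][['winery', 'variety', 'designation_replace']]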
Example #12
    model_prefix = sys.argv[1]

    logger.info("running %s" % ' '.join(sys.argv))

    logger.info("Loading word dictionary...")
    dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2')
    logger.debug(dictionary)

    logger.info("Loading document name map...")
    article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle')

    logger.info("Loading tf-idf model...")
    tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

    logger.info("Loading similarity index...")
    similarity_index = Similarity.load(model_prefix + '_similarity.index', mmap='r')
    similarity_index.use_reverse_index = True

    logger.info("Finished loading model files.")

    mismatches = 0
    for doc_idx in range(0, len(similarity_index)):
        logger.info("Checking doc: %d %s" % (doc_idx, article_dict[doc_idx]))
        rev_doc = scipy.sparse.dok_matrix((1, len(dictionary)), dtype=np.float64)
        fwd_doc = similarity_index.vector_by_id(doc_idx)
        for feature_id, val in enumerate(fwd_doc.toarray().flatten()):
            if val == 0: continue
            feat_rev_docs = similarity_index.docs_by_feature_id(feature_id).toarray().flatten()
            rev_doc[0, feature_id] = feat_rev_docs[doc_idx]
        rev_doc = rev_doc.tocsr()
Example #13
# load models

print "\n    Loading models, etc..\n"
id2word_pgfin = gensim.corpora.Dictionary.load('./data/pgfin.dictionary')
tfidf_model = gensim.models.TfidfModel.load('./data/tfidf_pgfin.model')
lsi_model = gensim.models.LsiModel.load('./data/lsi_pgfin.model')
indexfile = './data/ta_index.txt'
queryfile = './queryfiles/queryfile.txt'  # text in corpus
# queryfile = './queryfiles/45vuotta.txt'  # Film review
# queryfile = './queryfiles/tktjohdessee2.txt'  # Ancient essay

# check similarity

print "\n    Load similarity indices.\n"
index = Similarity.load('./data/pgfin_index.index')
index_dense = MatrixSimilarity.load('./data/pgfin_matrixindex.index')

with open(queryfile, 'r') as datafile:
    query = datafile.read()

# vectorize the query text into bag-of-words and tfidf
query_bow = id2word_pgfin.doc2bow(tokenize(query))
query_tfidf = tfidf_model[query_bow]
query_lsi = lsi_model[query_tfidf]

index_dense.num_best = 5


class BookHitValue(object):
Example #14
    input_file, output_prefix = sys.argv[1:3]

    logger.info("running %s" % ' '.join(sys.argv))

    logger.info("Loading word dictionary...")
    dictionary = Dictionary.load_from_text(output_prefix + '_wordids.txt.bz2')
    logger.debug(dictionary)

    logger.info("Loading document name map...")
    article_dict = utils.unpickle(output_prefix + '_bow.mm.metadata.cpickle')

    logger.info("Loading tf-idf model...")
    tfidf = TfidfModel.load(output_prefix + '.tfidf_model')

    logger.info("Loading similarity index...")
    similarity_index = Similarity.load(output_prefix + '_similarity.index', mmap='r')
    similarity_index.use_reverse_index = True
    similarity_index.preload_reverse_index()

    logger.info("Finished loading model files.")

    logger.info("Processing input documents...")

    try:
        infile = open(input_file, 'r')
    except IOError:
        print('cannot open %s' % (input_file,))
        sys.exit(1)

    for docnum, line in enumerate(infile):
        line = line.rstrip()