Example no. 1
def process_records(records, fields, target, textmodel=None):
	tokenize = CountVectorizer().build_analyzer()

	input = None
	X = None
	y_labels = []

	for i, record in enumerate(records):
		nums = []
		strs = []
		y_labels.append(record.get(target))

		for field in fields:
			if is_number(record.get(field)):
				nums.append(record[field])
			else:
				strs.append(str(record.get(field) or "").lower())
		if strs:
			if input is None:
				input = StringIO.StringIO()
			print >> input, " ".join(tokenize(" ".join(strs)))
		if nums:
			if X is None:
				X = sp.lil_matrix((len(records),len(nums)))
			X[i] = np.array(nums, dtype=np.float64)

	if input is not None:
		if X is not None:
			X_2 = X.tocsr()
		else:
			X_2 = None

		if isinstance(textmodel,basestring):
			if textmodel == 'lsi':
				corpus = TextCorpus(input)
				textmodel = LsiModel(corpus, chunksize=1000)
			elif textmodel == 'tfidf':
				corpus = TextCorpus(input)
				textmodel = TfidfModel(corpus)
			elif textmodel == 'hashing':
				textmodel = None
				hasher = FeatureHasher(n_features=2 ** 18, input_type="string")
				input.seek(0)
				X = hasher.transform(tokenize(line.strip()) for line in input)
		if textmodel:
			num_terms = len(textmodel.id2word or getattr(textmodel, 'dfs',[]))
			X = corpus2csc(textmodel[corpus], num_terms).transpose()

		if X_2 is not None:
			# print >> sys.stderr, "X SHAPE:", X.shape
			# print >> sys.stderr, "X_2 SHAPE:", X_2.shape
			X = sp.hstack([X, X_2], format='csr')

	elif X is not None:
		textmodel = None
		X = X.tocsr()

	print >> sys.stderr, "X SHAPE:", X.shape

	return X, y_labels, textmodel
Example no. 2
def __init__(self, lex_prob_file, corpus_file):
    self.lex_prob = defaultdict(list)
    for line in open(lex_prob_file):
        chunks = line[:-1].split()
        self.lex_prob[chunks[1]].append(float(chunks[2]))
    corpus = TextCorpus(input=corpus_file)
    self.corpus_freq = FreqDist([word for line in corpus.get_texts() for word in line])
    self.thresholds = [0.01, 0.05, 0.1, 0.2, 0.5]
Example no. 4
def pretrain():
    """pre train the text corpus and build the dictionary"""
    gutenberg_corpus = TextCorpus(text_corpus_file)
    gutenberg_corpus.dictionary.save(dict_file)
    gutenberg_corpus.dictionary.save_as_text(dic_txt_file)
    mm = MmCorpus.serialize(mm_corpus_file, gutenberg_corpus)
    print mm
Example no. 5
def train(text_corpus_file, dict_file):
    """train lsi model from text corpus"""
    gutenberg_corpus = TextCorpus(text_corpus_file)
    dictionary = Dictionary.load(dict_file)
    lsi = LsiModel(corpus=gutenberg_corpus, id2word=dictionary, num_topics=400)
    lsi.save(model_file)
    print lsi.projection.u
    print lsi.projection.u.size
    print lsi.projection.u[0].size
Example no. 6
def train_gensim():
	from gensim.corpora import TextCorpus
	from gensim.corpora.textcorpus import lower_to_unicode
	from gensim.models import Word2Vec as GensimWord2Vec

	start = time()

	stopwords = []
	if args.stop_word_lang:
		# starting spark only for this...
		spark = SparkSession.builder.appName("load stop words").getOrCreate()
		stopwords += StopWordsRemover.loadDefaultStopWords(args.stop_word_lang)
		spark.sparkContext.stop()
	if args.stop_word_file:
		with open(args.stop_word_file) as stop_word_file:
			stopwords += [word.strip("\n") for word in stop_word_file.readlines()]

	def remove_stopwords(tokens):
		return [token for token in tokens if token not in stopwords]

	corpus = TextCorpus(
		args.txtPath,
		dictionary={None: None},
		character_filters=[lower_to_unicode],
		token_filters=[remove_stopwords]
	)

	model = GensimWord2Vec(
		seed=1,
		alpha=args.step_size,
		size=args.vector_size,
		window=args.window_size,
		sample=1e-6,
		sg=1
	)
	model.build_vocab(corpus.get_texts())
	model.train(corpus.get_texts(), total_examples=model.corpus_count, epochs=model.epochs)
	model.save(args.modelPath)

	end = time()
	print("Gensim training took {} seconds".format(end - start))
Example no. 7
if len(sys.argv) < 3:
    # the snippet is truncated here; a minimal argument check is assumed: the script
    # needs at least an input corpus and an output prefix before unpacking them below
    sys.exit(1)
inp, outp = sys.argv[1:3]

if len(sys.argv) > 3:
    ntopics = int(sys.argv[3])

if len(sys.argv) > 4:
    keep_words = int(sys.argv[4])
else:
    keep_words = DEFAULT_DICT_SIZE

if os.path.exists(outp +
                  '_wordids.txt.bz2') and os.path.exists(outp +
                                                         '_corpus.pkl.bz2'):
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    wiki = TextCorpus.load(outp + '_corpus.pkl.bz2')
else:
    wiki = TextCorpus(inp)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20,
                                    no_above=0.1,
                                    keep_n=keep_words)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    wiki.save(outp + '_corpus.pkl.bz2')
    # load back the id->word mapping directly from file
    # this seems to save more memory, compared to keeping the wiki.dictionary object from above
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

# build tfidf
if os.path.exists(outp + '_tfidf.mm'):
    mm = gensim.corpora.MmCorpus(outp + '_tfidf.mm')
Example no. 8
__author__ = 'Marci'

import logging, sys, pprint
from gensim.corpora import TextCorpus, MmCorpus, Dictionary

# Set logging for gensim
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# gensim docs: "Provide a filename or a file-like object as input and TextCorpus will be initialized with a
# dictionary in `self.dictionary` and will support the `iter` corpus method. For other kinds of corpora, you only
# need to override `get_texts` and provide your own implementation."
background_corpus = TextCorpus(input=YOUR_CORPUS)

# Important -- save the dictionary generated by the corpus, or future operations will not be able to map results
# back to original words.
background_corpus.dictionary.save("my_dict.dict")

MmCorpus.serialize(
    "background_corpus.mm", background_corpus
)  #  Uses numpy to persist wiki corpus in Matrix Market format. File will be several GBs.
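
# A minimal sketch of the "future operations" the comment above warns about (assumes the
# two files written by the calls above exist on disk): reload them so that term ids in
# later results can be mapped back to the original words.
loaded_dict = Dictionary.load("my_dict.dict")  # id -> word mapping
bow_corpus = MmCorpus("background_corpus.mm")  # streamed bag-of-words corpus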

### Generating a large training/background corpus using Wikipedia
from gensim.corpora import WikiCorpus, wikicorpus

articles = "enwiki-latest-pages-articles.xml.bz2"  # available from http://en.wikipedia.org/wiki/Wikipedia:Database_download

# This will take many hours! Output is Wikipedia in bag-of-words (BOW) sparse matrix.
wiki_corpus = WikiCorpus(articles)
wiki_corpus.dictionary.save("wiki_dict.dict")

MmCorpus.serialize("wiki_corpus.mm", wiki_corpus)  #  File will be several GBs.
Example no. 9
def __init__(self, corpus_file):
    corpus = TextCorpus(input=corpus_file)
    self.words = corpus.dictionary.values()
Example no. 10
from gensim.corpora import TextCorpus, MmCorpus, Dictionary
from gensim.models import TfidfModel
from gensim.models.ldamodel import LdaModel
from gensim.models.hdpmodel import HdpModel
import bz2

out = '/home/mjg/data/descriptions'

# Form corpus
corpus = TextCorpus(bz2.BZ2File(out + '.bz2'))

# remove common words
stoplist = set(
    'a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your'
    .split(','))
stop_ids = [
    corpus.dictionary.token2id[stopword] for stopword in stoplist
    if stopword in corpus.dictionary.token2id
]
corpus.dictionary.filter_tokens(stop_ids)

# only keep the most frequent words
corpus.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=100000)
# save stuff
MmCorpus.serialize(out + '_bow.mm', corpus, progress_cnt=10000)
corpus.dictionary.save_as_text(out + '_wordids.txt.bz2')
# save memory
dictionary = Dictionary.load_from_text(out + '_wordids.txt.bz2')
del corpus

# initialize corpus reader and word->id mapping
Example no. 11
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) 2017  Serge Sharoff
# This program is free software under GPL 3, see http://www.gnu.org/licenses/
'''
A script for inferring topics for documents with an existing model
'''
import sys
from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora import Dictionary, TextCorpus

mname = sys.argv[1]
cname = sys.argv[2]

lda = LdaMulticore.load(mname)
dictionary = Dictionary.load_from_text(cname + '_wordids.txt.bz2')
wiki = TextCorpus.load(cname + '_corpus.pkl.bz2')

for d in wiki.get_texts():
    #bow = dictionary.doc2bow(d.split())
    t = lda.get_document_topics(dictionary.doc2bow(d))
    # pick the topic with the highest probability for this document
    besttop, besttopval = max(t, key=lambda topic: topic[1])
    print('%d %.3f' % (besttop, besttopval))
Example no. 12
	if wiki: # models will be trained on the Dutch Wikipedia corpus
		if os.path.exists(f_bow):
			corpus = WikiCorpus.load(f_bow)
		else:
			# download wikipedia training corpus (2015/10/14 18:45, 132MB)
			if not os.path.exists(f_corpus):
				wiki_lang, wiki_size, wiki_url = wikis[lang]
				if raw_input("About to download {0} Wikipedia corpus ({1}). Do you want to proceed? (y/n) ".format(wiki_lang, wiki_size)).startswith("y"):
					util.download_file(wiki_url, f_corpus, progress=True)
				else:
					sys.exit()
			corpus = WikiCorpus(f_corpus)
#			corpus.save(f_bow)
	else: # models will be trained on your own corpus
		if os.path.exists(f_bow):
			corpus = TextCorpus.load(f_bow)
		else:
			corpus = TextCorpus(f_corpus)
#			corpus.save(f_bow)

	# filter dictionary
	corpus.dictionary.filter_extremes(no_below=0, no_above=1, keep_n=voc_size)
	corpus.dictionary.save(f_dict)
	corpus.save(f_bow)

	# tf-idf model
	if os.path.exists(f_tfidf):
		tfidf = TfidfModel.load(f_tfidf)
	else:
		tfidf = TfidfModel(corpus, id2word=corpus.dictionary)
		tfidf.save(f_tfidf)
Example no. 13
	def __init__(self, input=None):
		TextCorpus.__init__(self, input)
Example no. 14
load_dotenv('./.env')

# logging
import logging
logger = logging.getLogger()
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
logger.addHandler(ch)

logger.info("Setting up app!")

app = Flask(__name__)
CORS(app)

# setup model stuff
corpus = TextCorpus('jobspicker/jobspicker-descriptions.csv')
corpus.dictionary.filter_extremes(no_below=4, no_above=.9, keep_n=100000)
sentences = [list(g) for g in list(corpus.get_texts())]
tfidf = TfidfModel(corpus)
model = Word2Vec.load("profiles.model")
corp_vecs = corpus_vec(sentences, model, corpus)

# create simple helper functions
get_vec = lambda t: sentence_to_vec(t, model, corpus, tfidf)
get_job = lambda v: get_closest_doc(v, corp_vecs, sentences)

# our database of bayesopt models
user_models = {}

@app.route('/init/<i>')
def init(i):