def process_records(records, fields, target, textmodel=None):
    tokenize = CountVectorizer().build_analyzer()

    input = None
    X = None
    y_labels = []

    for i, record in enumerate(records):
        nums = []
        strs = []
        y_labels.append(record.get(target))

        for field in fields:
            if is_number(record.get(field)):
                nums.append(record[field])
            else:
                strs.append(str(record.get(field) or "").lower())

        if strs:
            if input is None:
                input = StringIO.StringIO()
            print >> input, " ".join(tokenize(" ".join(strs)))
        if nums:
            if X is None:
                X = sp.lil_matrix((len(records), len(nums)))
            X[i] = np.array(nums, dtype=np.float64)

    if input is not None:
        if X is not None:
            X_2 = X.tocsr()
        else:
            X_2 = None

        if isinstance(textmodel, basestring):
            if textmodel == 'lsi':
                corpus = TextCorpus(input)
                textmodel = LsiModel(corpus, chunksize=1000)
            elif textmodel == 'tfidf':
                corpus = TextCorpus(input)
                textmodel = TfidfModel(corpus)
            elif textmodel == 'hashing':
                textmodel = None
                hasher = FeatureHasher(n_features=2 ** 18, input_type="string")
                input.seek(0)
                X = hasher.transform(tokenize(line.strip()) for line in input)

        if textmodel:
            num_terms = len(textmodel.id2word or getattr(textmodel, 'dfs', []))
            X = corpus2csc(textmodel[corpus], num_terms).transpose()

        if X_2 is not None:
            # print >> sys.stderr, "X SHAPE:", X.shape
            # print >> sys.stderr, "X_2 SHAPE:", X_2.shape
            X = sp.hstack([X, X_2], format='csr')

    elif X is not None:
        textmodel = None
        X = X.tocsr()

    print >> sys.stderr, "X SHAPE:", X.shape

    return X, y_labels, textmodel
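# A minimal usage sketch (not part of the original function; the record dicts
# and field names below are made-up examples): vectorize a mix of numeric and
# text fields, building an LSI model over the text part.
example_records = [
    {"age": 34, "title": "data engineer", "label": "tech"},
    {"age": 51, "title": "sales manager", "label": "sales"},
]
X, y, lsi_model = process_records(example_records, ["age", "title"], "label", textmodel='lsi')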
def __init__(self, lex_prob_file, corpus_file):
    self.lex_prob = defaultdict(list)
    for line in open(lex_prob_file):
        chunks = line[:-1].split()
        self.lex_prob[chunks[1]].append(float(chunks[2]))
    corpus = TextCorpus(input=corpus_file)
    self.corpus_freq = FreqDist([word for line in corpus.get_texts() for word in line])
    self.thresholds = [0.01, 0.05, 0.1, 0.2, 0.5]
def pretrain():
    """Pre-train the text corpus and build the dictionary."""
    gutenberg_corpus = TextCorpus(text_corpus_file)
    gutenberg_corpus.dictionary.save(dict_file)
    gutenberg_corpus.dictionary.save_as_text(dic_txt_file)
    mm = MmCorpus.serialize(mm_corpus_file, gutenberg_corpus)
    print mm
def train(text_corpus_file, dict_file):
    """Train an LSI model from the text corpus."""
    gutenberg_corpus = TextCorpus(text_corpus_file)
    dict = Dictionary.load(dict_file)
    lsi = LsiModel(corpus=gutenberg_corpus, id2word=dict, num_topics=400)
    lsi.save(model_file)
    print lsi.projection.u
    print lsi.projection.u.size
    print lsi.projection.u[0].size
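# A possible follow-up sketch (not in the original; reuses the module-level
# dict_file and model_file names from above): load the trained LSI model and
# project a tokenized document into topic space.
def infer(doc_tokens):
    dictionary = Dictionary.load(dict_file)
    lsi = LsiModel.load(model_file)
    bow = dictionary.doc2bow(doc_tokens)
    return lsi[bow]  # list of (topic_id, weight) pairs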
def train_gensim():
    from gensim.corpora import TextCorpus
    from gensim.corpora.textcorpus import lower_to_unicode
    from gensim.models import Word2Vec as GensimWord2Vec

    start = time()

    stopwords = []
    if args.stop_word_lang:
        # starting spark only for this...
        spark = SparkSession.builder.appName("load stop words").getOrCreate()
        stopwords += StopWordsRemover.loadDefaultStopWords(args.stop_word_lang)
        spark.sparkContext.stop()
    if args.stop_word_file:
        with open(args.stop_word_file) as stop_word_file:
            stopwords += [word.strip("\n") for word in stop_word_file.readlines()]

    def remove_stopwords(tokens):
        return [token for token in tokens if token not in stopwords]

    corpus = TextCorpus(
        args.txtPath,
        dictionary={None: None},
        character_filters=[lower_to_unicode],
        token_filters=[remove_stopwords]
    )

    model = GensimWord2Vec(
        seed=1,
        alpha=args.step_size,
        size=args.vector_size,
        window=args.window_size,
        sample=1e-6,
        sg=1
    )
    model.build_vocab(corpus.get_texts())
    model.train(corpus.get_texts(), total_examples=model.corpus_count, epochs=model.epochs)
    model.save(args.modelPath)

    end = time()
    print("Gensim training took {} seconds".format(end - start))
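# Illustrative follow-up (not part of the original): reload the model saved by
# train_gensim() and query it; "example" is a placeholder token.
def query_gensim(token="example"):
    from gensim.models import Word2Vec
    model = Word2Vec.load(args.modelPath)
    return model.wv.most_similar(token, topn=5)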
    sys.exit(1)

inp, model_name = sys.argv[1:3]
if len(sys.argv) > 3:
    ntopics = int(sys.argv[3])
if len(sys.argv) > 4:
    keep_words = int(sys.argv[4])
else:
    keep_words = DEFAULT_DICT_SIZE

if os.path.exists(outp + '_wordids.txt.bz2') and os.path.exists(outp + '_corpus.pkl.bz2'):
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    wiki = TextCorpus.load(outp + '_corpus.pkl.bz2')
else:
    wiki = TextCorpus(inp)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=keep_words)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    wiki.save(outp + '_corpus.pkl.bz2')
    # load back the id->word mapping directly from file
    # this seems to save more memory, compared to keeping the wiki.dictionary object from above
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

# build tfidf
if os.path.exists(outp + '_tfidf.mm'):
    mm = gensim.corpora.MmCorpus(outp + '_tfidf.mm')
__author__ = 'Marci'

import logging, sys, pprint
from gensim.corpora import TextCorpus, MmCorpus, Dictionary

# Set logging for gensim
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# gensim docs: "Provide a filename or a file-like object as input and TextCorpus will be initialized with a
# dictionary in `self.dictionary` and will support the `iter` corpus method. For other kinds of corpora, you only
# need to override `get_texts` and provide your own implementation."
background_corpus = TextCorpus(input=YOUR_CORPUS)

# Important -- save the dictionary generated by the corpus, or future operations will not be able to map results
# back to original words.
background_corpus.dictionary.save("my_dict.dict")

MmCorpus.serialize(
    "background_corpus.mm",
    background_corpus
)  # Uses numpy to persist the corpus in Matrix Market format. File will be several GBs.

### Generating a large training/background corpus using Wikipedia
from gensim.corpora import WikiCorpus, wikicorpus

articles = "enwiki-latest-pages-articles.xml.bz2"  # available from http://en.wikipedia.org/wiki/Wikipedia:Database_download

# This will take many hours! Output is Wikipedia in bag-of-words (BOW) sparse matrix form.
wiki_corpus = WikiCorpus(articles)
wiki_corpus.dictionary.save("wiki_dict.dict")

MmCorpus.serialize("wiki_corpus.mm", wiki_corpus)  # File will be several GBs.
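# A small follow-on sketch (not in the original): reload the persisted
# artifacts, which is how later gensim steps would typically consume them.
loaded_dict = Dictionary.load("wiki_dict.dict")
loaded_corpus = MmCorpus("wiki_corpus.mm")  # streamed from disk, not held in RAM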
def __init__(self, corpus_file):
    corpus = TextCorpus(input=corpus_file)
    self.words = corpus.dictionary.values()
from gensim.corpora import TextCorpus, MmCorpus, Dictionary
from gensim.models import TfidfModel
from gensim.models.ldamodel import LdaModel
from gensim.models.hdpmodel import HdpModel
import bz2

out = '/home/mjg/data/descriptions'

# Form corpus
corpus = TextCorpus(bz2.BZ2File(out + '.bz2'))

# remove common words
stoplist = set(
    'a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your'
    .split(','))
stop_ids = [
    corpus.dictionary.token2id[stopword]
    for stopword in stoplist
    if stopword in corpus.dictionary.token2id
]
corpus.dictionary.filter_tokens(stop_ids)

# only keep the most frequent words
corpus.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=100000)

# save stuff
MmCorpus.serialize(out + '_bow.mm', corpus, progress_cnt=10000)
corpus.dictionary.save_as_text(out + '_wordids.txt.bz2')

# save memory
dictionary = Dictionary.load_from_text(out + '_wordids.txt.bz2')
del corpus

# initialize corpus reader and word->id mapping
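# A possible continuation (not in the original snippet): stream the serialized
# bag-of-words corpus back from disk and fit the models imported above on it;
# num_topics=100 is an arbitrary illustrative choice.
mm = MmCorpus(out + '_bow.mm')
tfidf = TfidfModel(mm, id2word=dictionary)
lda = LdaModel(corpus=tfidf[mm], id2word=dictionary, num_topics=100)
hdp = HdpModel(corpus=mm, id2word=dictionary)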
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) 2017 Serge Sharoff
# This program is free software under GPL 3, see http://www.gnu.org/licenses/
'''A script for inferring topics for documents with an existing model'''
import sys

from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora import Dictionary, TextCorpus

mname = sys.argv[1]
cname = sys.argv[2]

lda = LdaMulticore.load(mname)
dictionary = Dictionary.load_from_text(cname + '_wordids.txt.bz2')
wiki = TextCorpus.load(cname + '_corpus.pkl.bz2')

for d in wiki.get_texts():
    # bow = dictionary.doc2bow(d.split())
    t = lda.get_document_topics(dictionary.doc2bow(d))
    besttopval = 0
    for i in range(len(t)):
        topic = t[i]
        if topic[1] > besttopval:
            besttopval = topic[1]
            besttop = topic[0]
    print('%d %.3f' % (besttop, besttopval))
if wiki:
    # models will be trained on the Dutch Wikipedia corpus
    if os.path.exists(f_bow):
        corpus = WikiCorpus.load(f_bow)
    else:
        # download wikipedia training corpus (2015/10/14 18:45, 132MB)
        if not os.path.exists(f_corpus):
            wiki_lang, wiki_size, wiki_url = wikis[lang]
            if raw_input("About to download {0} Wikipedia corpus ({1}). Do you want to proceed? (y/n) ".format(wiki_lang, wiki_size)).startswith("y"):
                util.download_file(wiki_url, f_corpus, progress=True)
            else:
                sys.exit()
        corpus = WikiCorpus(f_corpus)
        # corpus.save(f_bow)
else:
    # models will be trained on your own corpus
    if os.path.exists(f_bow):
        corpus = TextCorpus.load(f_bow)
    else:
        corpus = TextCorpus(f_corpus)
        # corpus.save(f_bow)

# filter dictionary
corpus.dictionary.filter_extremes(no_below=0, no_above=1, keep_n=voc_size)
corpus.dictionary.save(f_dict)
corpus.save(f_bow)

# tf-idf model
if os.path.exists(f_tfidf):
    tfidf = TfidfModel.load(f_tfidf)
else:
    tfidf = TfidfModel(corpus, id2word=corpus.dictionary)
    tfidf.save(f_tfidf)
def __init__(self, input=None):
    TextCorpus.__init__(self, input)
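# For context, a minimal sketch (class name and file format are illustrative
# assumptions) of the kind of TextCorpus subclass this __init__ belongs to:
# per the gensim docs, only get_texts() needs to be overridden.
class LineCorpus(TextCorpus):
    def __init__(self, input=None):
        TextCorpus.__init__(self, input)

    def get_texts(self):
        # assumes `self.input` is a path to a plain-text file, one document per line
        with open(self.input) as f:
            for line in f:
                yield line.lower().split()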
load_dotenv('./.env')

# logging
import logging
logger = logging.getLogger()
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
logger.addHandler(ch)
logger.info("Setting up app!")

app = Flask(__name__)
CORS(app)

# setup model stuff
corpus = TextCorpus('jobspicker/jobspicker-descriptions.csv')
corpus.dictionary.filter_extremes(no_below=4, no_above=.9, keep_n=100000)
sentences = [list(g) for g in list(corpus.get_texts())]
tfidf = TfidfModel(corpus)
model = Word2Vec.load("profiles.model")
corp_vecs = corpus_vec(sentences, model, corpus)

# create simple helper functions
get_vec = lambda t: sentence_to_vec(t, model, corpus, tfidf)
get_job = lambda v: get_closest_doc(v, corp_vecs, sentences)

# our database of bayesopt models
user_models = {}


@app.route('/init/<i>')
def init(i):