# Remaining command-line options for the LDA script. `parser` is created
# earlier in the file, outside this fragment.
parser.add_argument(
    '-c', '--corpus', default=None,
    help="python pickle file containing tokens")
parser.add_argument(
    '-s', '--sparse-matrix', default=None,
    help="sparse matrix file")
parser.add_argument('-d', '--dictionary', default=None)
# type=int lets argparse reject non-numeric values at parse time, i.e.
# before the corpus is loaded, instead of failing later inside int().
parser.add_argument('-j', '--jobs', type=int, default=1)
parser.add_argument('-m', '--metadata', default=None)
parser.add_argument('-t', '--topics', type=int, default=10)
args = parser.parse_args()

print("loading corpus")
corpus = load_sparse_corpus(
    sparse_matrix_file=args.sparse_matrix,
    documents_file=args.corpus,
    dictionary_file=args.dictionary,
    metadata_filename=args.metadata)

print("calculating LDA of {0} topics".format(args.topics))
lda = ScikitLda(
    corpus=corpus,
    n_topics=args.topics,
    n_jobs=args.jobs)

# args.output_folder comes from an argument declared outside this fragment.
fname = os.path.join(args.output_folder, 'lda_{0}.pkl'.format(args.topics))
print("writing to file: lda model {0}".format(fname))
# NOTE(review): the model is saved before fit_transform() is called below;
# confirm that ScikitLda fits (or does not need fitting) before save().
lda.save(fname)

fname = os.path.join(
    args.output_folder, 'lda_documents_{0}.csv'.format(args.topics))
print("writing to file: topics vs documents {0}".format(fname))
topic_document_matrix = lda.fit_transform()
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse

from corpora import load_sparse_corpus

if __name__ == '__main__':
    # Command-line interface: two required positionals (the corpus pickle
    # and the output matrix file) plus an optional dictionary file.
    cli = argparse.ArgumentParser(
        description="load files of corpus and store sparse corpus")
    cli.add_argument(
        'corpus',
        help="python pickle file, containing tokens and metadata")
    cli.add_argument('matrix')
    cli.add_argument('-d', '--dictionary', default=None)
    options = cli.parse_args()

    print("loading corpus")
    # The positional `corpus` is passed as the documents file; the
    # dictionary file is None when -d/--dictionary is not given.
    load_kwargs = {
        'documents_file': options.corpus,
        'dictionary_file': options.dictionary,
    }
    sparse_corpus = load_sparse_corpus(**load_kwargs)

    print("writing matrix to file")
    # The positional `matrix` is the destination passed to save().
    sparse_corpus.save(sparse_matrix_file=options.matrix)