def main():
    parser = argparse.ArgumentParser(
        description='accumulates values of identical (document, author) pairs')
    parser.add_argument(
        '--raw-contribs',
        type=argparse.FileType('r'),
        help='path to input MatrixMarket raw contributions file (.mm/.mm.bz2)',
        required=True)
    parser.add_argument(
        '--acc-contribs',
        type=argparse.FileType('w'),
        help='path to output MatrixMarket accumulated contributions .mm file',
        required=True)
    args = parser.parse_args()
    input_raw_contribs_dump_path = args.raw_contribs.name
    output_acc_contribs_dump_path = args.acc_contribs.name
    logger.info('running with:\n{}'.format(
        pformat({
            'input_raw_contribs_dump_path': input_raw_contribs_dump_path,
            'output_acc_contribs_dump_path': output_acc_contribs_dump_path
        })))

    # load, accumulate & save contributions
    raw_contribs = MmCorpus(input_raw_contribs_dump_path)
    acc_contribs = (accumulate(raw_doc_contribs) for raw_doc_contribs in raw_contribs)
    #MmCorpus.serialize(output_acc_contribs_dump_path, corpus=acc_contribs, progress_cnt=10000)
    MmWriter.write_corpus(output_acc_contribs_dump_path,
                          corpus=acc_contribs,
                          index=False,
                          progress_cnt=10000,
                          metadata=False)
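# A minimal sketch of the accumulate() helper assumed by main() above: for one
# document's sparse vector it sums all values that share the same author id, so
# repeated (document, author) pairs collapse into a single accumulated entry.
# This is an illustrative assumption; the helper in the original code base may differ.
from collections import defaultdict

def accumulate(doc_contribs):
    acc = defaultdict(float)
    for author_id, value in doc_contribs:
        acc[author_id] += value
    return sorted(acc.items())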
def __init__(self, filename):
    """\
    `filename`
        The name of the target file.
    """
    mmw = MmWriter(filename)
    # write empty headers to the file (with enough space to be overwritten later)
    mmw.write_headers(-1, -1, -1)  # will print 50 spaces followed by newline on the stats line
    self._mmw = mmw
    self._num_docs = -1
    self._num_terms = 0
    self._num_nnz = 0  # number of non-zeroes in the sparse corpus matrix
def main(args):
    cp = SafeConfigParser()
    cp.read(args.config)
    base_dir = os.path.join(cp.get('general', 'local_data_dir'), args.dir)
    hadoop_base_dir = os.path.join(cp.get('general', 'hadoop_data_dir'), args.dir)
    word2index_file = os.path.join(base_dir, cp.get('LDA', 'word2index'))
    blei_corpus_file = os.path.join(base_dir, cp.get('LDA', 'blei_corpus'))
    doc2topic_file = os.path.join(base_dir, cp.get('LDA', 'doc2topic'))

    dictionary = gensim.corpora.dictionary.Dictionary()
    id2Token = dict(enumerate(l[:-1] for l in open(word2index_file)))
    dictionary.token2id = {v: k for k, v in id2Token.items()}
    corpus = gensim.corpora.bleicorpus.BleiCorpus(blei_corpus_file, fname_vocab=word2index_file)

    time1 = time.time()
    model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                                    num_topics=args.dim,
                                                    id2word=dictionary,
                                                    workers=8,
                                                    chunksize=10000,
                                                    passes=1,
                                                    batch=False,
                                                    alpha='symmetric',
                                                    eta=None,
                                                    decay=0.5,
                                                    offset=1.0,
                                                    eval_every=10,
                                                    iterations=50,
                                                    gamma_threshold=0.001)
    time2 = time.time()
    print('training lda model took %0.3f minutes' % ((time2 - time1) / 60.0))
    model.save(os.path.join(base_dir, 'lda_model'))

    time1 = time.time()
    matrix = model[corpus]
    MmWriter.write_corpus(doc2topic_file, matrix)
    time2 = time.time()
    print('creating lda vectors took %0.3f minutes' % ((time2 - time1) / 60.0))
def main(args):
    cp = SafeConfigParser()
    cp.read(args.config)
    base_dir = os.path.join(cp.get('DEFAULT', 'data_path'), args.lang)
    hadoop_base_dir = os.path.join(cp.get('DEFAULT', 'hadoop_data_path'), args.lang)
    word2index_file = os.path.join(base_dir, cp.get('recommendation', 'word2index'))
    blei_corpus_file = os.path.join(base_dir, cp.get('recommendation', 'blei_corpus'))
    doc2topic_file = os.path.join(base_dir, cp.get('recommendation', 'doc2topic'))

    dictionary = gensim.corpora.dictionary.Dictionary()
    id2Token = dict(enumerate(l[:-1] for l in open(word2index_file)))
    dictionary.token2id = {v: k for k, v in id2Token.items()}
    corpus = gensim.corpora.bleicorpus.BleiCorpus(blei_corpus_file, fname_vocab=word2index_file)

    time1 = time.time()
    model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                                    num_topics=args.dim,
                                                    id2word=dictionary,
                                                    workers=8,
                                                    chunksize=10000,
                                                    passes=1,
                                                    batch=False,
                                                    alpha='symmetric',
                                                    eta=None,
                                                    decay=0.5,
                                                    offset=1.0,
                                                    eval_every=10,
                                                    iterations=50,
                                                    gamma_threshold=0.001)
    time2 = time.time()
    logger.info('training lda model took %0.3f minutes' % ((time2 - time1) / 60.0))
    model.save(os.path.join(base_dir, 'lda_model'))

    time1 = time.time()
    matrix = model[corpus]
    MmWriter.write_corpus(doc2topic_file, matrix)
    time2 = time.time()
    logger.info('creating lda vectors took %0.3f minutes' % ((time2 - time1) / 60.0))
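# Usage sketch (an assumption, not part of the original scripts): the LDA model
# saved above under 'lda_model' can later be reloaded via gensim's standard
# load() API and applied to a single bag-of-words document to obtain its topic mix.
import os
import gensim

def load_topic_distribution(model_dir, bow_doc):
    """Return the [(topic_id, probability), ...] list for one bag-of-words document."""
    model = gensim.models.ldamulticore.LdaMulticore.load(os.path.join(model_dir, 'lda_model'))
    return model[bow_doc]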
def main():
    parser = argparse.ArgumentParser(
        description='prunes contribs of a given author-document-contribs file, storing only the top N max. contributions per author')
    parser.add_argument('--author-doc-contribs',
                        type=argparse.FileType('r'),
                        help='path to input contribution MatrixMarket file (.mm/.mm.bz2)',
                        required=True)
    parser.add_argument('--pruned-contribs',
                        type=argparse.FileType('w'),
                        help='path to output MatrixMarket .mm file',
                        required=True)
    parser.add_argument('--top-n-contribs',
                        type=int,
                        help='keep only the N contribs with the highest values per author',
                        required=True)
    args = parser.parse_args()
    input_author_doc_contribs_path = args.author_doc_contribs.name
    output_pruned_contribs_path = args.pruned_contribs.name
    top_n_contribs = args.top_n_contribs
    logger.info('running with:\n{}'.format(
        pformat({
            'input_author_doc_contribs_path': input_author_doc_contribs_path,
            'output_pruned_contribs_path': output_pruned_contribs_path,
            'top_n_contribs': top_n_contribs
        })))

    contribs = MmCorpus(input_author_doc_contribs_path)
    num_authors = contribs.num_docs
    num_docs = contribs.num_terms
    logger.info('processing contributions of {} authors, {} docs'.format(num_authors, num_docs))
    pruned_contribs = prune_contribs_of_authors(contribs, top_n_contribs)
    logger.info('writing pruned corpus')
    MmWriter.write_corpus(output_pruned_contribs_path,
                          pruned_contribs,
                          num_terms=num_docs,
                          index=False,
                          progress_cnt=10000,
                          metadata=False)
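# A minimal sketch of the prune_contribs_of_authors() helper assumed above: for
# each author row of the (author x document) contribution matrix it keeps only the
# N entries with the highest values. Illustrative only; the real helper may differ.
def prune_contribs_of_authors(contribs, top_n):
    for author_contribs in contribs:
        yield sorted(author_contribs, key=lambda entry: entry[1], reverse=True)[:top_n]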
def main():
    parser = argparse.ArgumentParser(
        description='creates an id2author mapping gensim dictionary, a document->authorid contributions MatrixMarket file and a binary article title file from a given WikiMedia *-pages-meta-history dump (considering only articles in mainspace!)')
    parser.add_argument('--history-dump',
                        type=argparse.FileType('r'),
                        help='path to input WikiMedia *-pages-meta-history file (.xml/.xml.bz2)',
                        required=True)
    parser.add_argument('--id2author',
                        type=argparse.FileType('w'),
                        help='path to output text id2author dictionary (.txt/.txt.bz2)',
                        required=True)
    parser.add_argument('--contribs',
                        type=argparse.FileType('w'),
                        help='path to output MatrixMarket contributions .mm file; also creates a binary article title file CONTRIBS.metadata.cpickle',
                        required=True)
    parser.add_argument('--contribution-value',
                        choices=CONTRIBUTION_VALUE_FUNCTIONS,
                        help='calculated per-contribution value; choices: {}'.format(CONTRIBUTION_VALUE_FUNCTIONS.keys()),
                        required=True)
    parser.add_argument("--namespace-prefixes",
                        type=argparse.FileType('r'),
                        help='file of namespace prefixes to ignore')
    args = parser.parse_args()
    input_history_dump_path = args.history_dump.name
    output_id2author_path = args.id2author.name
    output_contribs_path = args.contribs.name
    contribution_value = args.contribution_value
    namespace_prefixes = read_lines(args.namespace_prefixes.name) if args.namespace_prefixes else ()
    logger.info('running with:\n{}'.format(
        pformat({
            'input_history_dump_path': input_history_dump_path,
            'output_id2author_path': output_id2author_path,
            'output_contribs_path': output_contribs_path,
            'contribution_value': contribution_value,
            'namespace_prefixes': namespace_prefixes
        })))

    # build the id2author dictionary: maps author names of registered, non-bot authors to IDs and vice versa
    with smart_open(input_history_dump_path) as history_dump_file:
        logger.info('generating author->id mappings')
        history_dump = xml_dump.Iterator.from_file(history_dump_file)
        # use gensim's id2word dictionary as an id2author dictionary: authors correspond to terms
        id2author = Dictionary(get_revision_authors_of_pages(history_dump, namespace_prefixes))
        logger.info('found {} different authors'.format(len(id2author)))
        logger.info('removing non-registered authors')
        remove_from_dictionary(id2author, is_registered_user)
        logger.info('reduced to {} registered authors'.format(len(id2author)))
        logger.info('removing bots')
        remove_from_dictionary(id2author, is_not_bot_user)
        logger.info('reduced to {} registered non-bot authors'.format(len(id2author)))
        id2author.compactify()
        id2author.save_as_text(output_id2author_path)

    # compute & save (author id, revision value) entries of valid authors' revisions for all articles
    with smart_open(input_history_dump_path) as history_dump_file:
        logger.info('generating MatrixMarket representation per revision: (docid, authorid, value of revision)')
        history_dump = xml_dump.Iterator.from_file(history_dump_file)
        revision_value_fun = CONTRIBUTION_VALUE_FUNCTIONS[contribution_value]
        doc_auth_contribs = MetadataCorpus(
            get_revision_values(get_revisions_of_pages(history_dump, namespace_prefixes), id2author, revision_value_fun))
        MmWriter.write_corpus(output_contribs_path,
                              corpus=doc_auth_contribs,
                              num_terms=len(id2author),
                              index=False,
                              progress_cnt=10000,
                              metadata=True)
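# A hedged sketch of the remove_from_dictionary() helper used above: it is assumed
# to drop every token (here: author name) for which the given predicate returns
# False, via gensim's Dictionary.filter_tokens(). The is_registered_user /
# is_not_bot_user predicates shown here are illustrative heuristics only and are
# not taken from the original code base.
def remove_from_dictionary(dictionary, predicate):
    bad_ids = [token_id for token, token_id in dictionary.token2id.items() if not predicate(token)]
    dictionary.filter_tokens(bad_ids=bad_ids)

def is_registered_user(author_name):
    # anonymous edits are recorded under the editor's IP address (heuristic: IPv4 only)
    return not author_name.replace('.', '').isdigit()

def is_not_bot_user(author_name):
    # heuristic: many bot accounts end in "bot"
    return not author_name.lower().endswith('bot')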
def main():
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("--articles-dump",
                        type=argparse.FileType('r'),
                        help='path to input .xml.bz2 articles dump',
                        required=True)
    parser.add_argument("--out-prefix",
                        help='prefix of the generated output files',
                        required=True)
    parser.add_argument("--keep-words",
                        type=int,
                        help='number of most frequent word types to keep (default {})',
                        required=True)
    parser.add_argument("--no-below",
                        type=int,
                        help='Keep only tokens which appear in at least NO_BELOW documents (default {})',
                        required=True)
    parser.add_argument("--no-above",
                        type=float,
                        help='Keep only tokens which appear in at most NO_ABOVE*CORPUSSIZE documents (default {})',
                        required=True)
    parser.add_argument("--article-min-tokens",
                        type=int,
                        help='Analyze only articles of >= ARTICLE_MIN_TOKENS tokens (default {}). Should be >= 1',
                        required=True)
    parser.add_argument("--token-min-len",
                        type=int,
                        help='Consider only tokens of at least MIN chars',
                        required=True)
    parser.add_argument('--remove-stopwords',
                        action='store_true',
                        help="remove English stopwords with gensim's stoplist")
    parser.add_argument("--namespace-prefixes",
                        type=argparse.FileType('r'),
                        help='file of namespace prefixes to ignore')
    args = parser.parse_args()
    input_articles_path = args.articles_dump.name
    output_prefix = args.out_prefix
    keep_words = args.keep_words
    no_below, no_above = args.no_below, args.no_above
    article_min_tokens = args.article_min_tokens
    token_min_len = args.token_min_len
    remove_stopwords = args.remove_stopwords
    namespace_prefixes = read_lines(args.namespace_prefixes.name) if args.namespace_prefixes else ()
    logger.info('running with:\n{}'.format(
        pformat({
            'input_articles_path': input_articles_path,
            'output_prefix': output_prefix,
            'keep_words': keep_words,
            'no_below': no_below,
            'no_above': no_above,
            'article_min_tokens': article_min_tokens,
            'token_min_len': token_min_len,
            'remove_stopwords': remove_stopwords,
            'namespace_prefixes': namespace_prefixes
        })))

    # generate & save vocabulary
    logger.info('generating vocabulary')
    stopwords = STOPWORDS if remove_stopwords else ()
    corpus = MediaWikiCorpus(input_articles_path, article_min_tokens, token_min_len, stopwords, namespace_prefixes)
    corpus.dictionary = Dictionary(corpus.get_texts())
    logger.info('filtering dictionary: removing terms in less than {} docs'.format(no_below))
    corpus.dictionary.filter_extremes(no_below=no_below, no_above=1, keep_n=keep_words)
    logger.info('filtering dictionary: removing terms in more than {} of all docs'.format(no_above))
    corpus.dictionary.filter_extremes(no_below=0, no_above=no_above, keep_n=keep_words)
    corpus.dictionary.compactify()
    output_id2word_path = output_prefix + '-id2word.txt'
    corpus.dictionary.save_as_text(output_id2word_path)

    # generate & save bag-of-words model from the vocabulary
    logger.info('generating bag of words corpus')
    corpus.metadata = True
    output_corpus_path = output_prefix + '.mm'
    #MmCorpus.serialize(output_corpus_path, corpus, progress_cnt=10000, metadata=True)
    MmWriter.write_corpus(output_corpus_path,
                          corpus=corpus,
                          index=False,
                          progress_cnt=10000,
                          metadata=True)
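# A hedged sketch of the read_lines() helper used above (and in the history-dump
# script) to load the namespace-prefix file: it is assumed to return the stripped,
# non-empty lines of a text file as a tuple. The real helper may differ.
def read_lines(path):
    with open(path) as prefix_file:
        return tuple(line.strip() for line in prefix_file if line.strip())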