Example 1
import numpy as np
import pandas as pd
from gensim import corpora, models


def lda():
    data = pd.read_csv("./lda_model/process.csv", encoding="utf-8", header=None)
    # data[2] = data[1].apply(lambda x: re.split(r'\s*', x))
    data[2] = data[1].apply(lambda x: x.split(' '))

    corpora_documents = []
    # print(data[2])
    for item_str in data[2]:
        # print(item_str)
        corpora_documents.append(item_str)
    del corpora_documents[0]

    print(corpora_documents[0])

    dict_1 = corpora.Dictionary(corpora_documents)
    dict_1.save('./lda_model/dict_v2')
    dict_corpora = [dict_1.doc2bow(i) for i in corpora_documents]
    print('dictionary built')

    # each element of a bag-of-words vector is the number of times that word occurs in the document
    # print(corpus)
    from gensim.corpora.mmcorpus import MmCorpus

    MmCorpus.serialize('ths_corpora.mm', dict_corpora)

    tfidf = models.TfidfModel(dict_corpora)
    corpus_tfidf = tfidf[dict_corpora]
    tfidf.save("./lda_model/my_model.tfidf")
    np.random.seed(SOME_FIXED_SEED)
    lda = models.LdaModel(corpus_tfidf, num_topics=78, id2word=dict_1, iterations=1000)
    # # #
    lda.save('./lda_model/mylda_v2')
    lda.show_topics()
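The snippet above trains and saves the dictionary, the TF-IDF model, and the LDA model, but never reloads them. A minimal sketch of the inference side, assuming the paths used above and a new query document tokenized the same way as the training data:

from gensim import corpora, models

# Reload the artifacts written by lda() above (paths taken from that snippet).
dict_1 = corpora.Dictionary.load('./lda_model/dict_v2')
tfidf = models.TfidfModel.load('./lda_model/my_model.tfidf')
lda = models.LdaModel.load('./lda_model/mylda_v2')

# Hypothetical query document, already split into tokens.
tokens = 'some new document already split into tokens'.split(' ')
bow = dict_1.doc2bow(tokens)

# The LDA model was trained on TF-IDF vectors, so apply the same transform before inference.
print(lda.get_document_topics(tfidf[bow]))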
Example 2
def create_corpus(src, out_dir, no_below=20, keep_words=_DEFAULT_KEEP_WORDS):
    """\

    """
    wordid_filename = os.path.join(out_dir, 'cables_wordids.pickle')
    bow_filename = os.path.join(out_dir, 'cables_bow.mm')
    tfidf_filename = os.path.join(out_dir, 'cables_tfidf.mm')
    predicate = None # Could be set to something like pred.origin_filter(pred.origin_germany)
    # 1. Create word dict
    dct = Dictionary()
    dct_handler = DictionaryHandler(dct)
    handler = create_filter(dct_handler)
    handle_source(src, handler, predicate)
    dct.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
    dct.save(wordid_filename)
    # 2. Reiterate through the cables and create the vector space
    corpus_handler = CorpusHandler(out_dir, dct=dct, allow_dict_updates=False)
    handler = create_filter(corpus_handler)
    handle_source(src, handler, predicate)
    # 3. Load corpus
    mm = MmCorpus(bow_filename)
    # 4. Create TF-IDF model
    tfidf = TfidfModel(mm, id2word=dct, normalize=True)
    # 5. Save the TF-IDF model
    MmCorpus.serialize(tfidf_filename, tfidf[mm], progress_cnt=10000)
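Once create_corpus has finished, the serialized artifacts can be streamed back without rebuilding anything. A small sketch under the same file-name assumptions (the out_dir path is hypothetical):

import os

from gensim.corpora import Dictionary
from gensim.corpora.mmcorpus import MmCorpus

out_dir = '/path/to/out_dir'  # whatever directory was passed to create_corpus
dct = Dictionary.load(os.path.join(out_dir, 'cables_wordids.pickle'))
tfidf_corpus = MmCorpus(os.path.join(out_dir, 'cables_tfidf.mm'))

# MmCorpus streams documents lazily, so even a large corpus can be scanned cheaply.
for i, doc in enumerate(tfidf_corpus):
    if i >= 3:
        break
    print([(dct[word_id], round(weight, 3)) for word_id, weight in doc])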
Example 4
    def save_corpus(cls, tokens_file, corpus_file, dictionary_path):
        print("Instantiating corpus")
        corpus = UserCorpus(tokens_file)
        print("Filtering extremes")
        corpus.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=100000)
        print("Serializing corpus")
        MmCorpus.serialize(corpus_file, corpus, progress_cnt=10000)
        print("Serializing dictionary")
        corpus.dictionary.save_as_text(dictionary_path)
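Because save_corpus writes the dictionary with save_as_text, reloading it later needs the matching load_from_text call rather than Dictionary.load. A brief sketch with hypothetical paths:

from gensim.corpora import Dictionary
from gensim.corpora.mmcorpus import MmCorpus

dictionary = Dictionary.load_from_text('user_dictionary.txt')  # whatever was passed as dictionary_path
corpus = MmCorpus('user_corpus.mm')                            # whatever was passed as corpus_file
print(dictionary)
print(corpus)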
Example 5
    def train(self, corpus, chunksize=10000, use_temp_files=True):
        """
        train the underlying linear mappings.

        @param corpus: a corpus in a gensim-compatible format
        @param use_temp_files: whether to use temporary files to store the intermediate representations of the
        corpus that are used to train the next layer. Setting this to True does not greatly affect memory usage,
        but temporarily requires a significant amount of disk space. Temp files speed up training considerably,
        especially as the number of layers increases.
        """
        ln.info("Training mSDA with %s layers.", len(self.mda_layers) + 1)
        if not use_temp_files:
            ln.warn("Training without temporary files. May take a long time!")
            self.reduction_layer.train(corpus, chunksize=chunksize)
            current_representation = self.reduction_layer[corpus]
            for layer_num, layer in enumerate(self.mda_layers):

                # We feed the corpus through all intermediate layers to get the current representation
                # that representation is then used to train the next layer
                # this is memory-independent, but will probably be very slow.

                ln.info("Training layer %s.", layer_num)
                layer.train(current_representation, chunksize=chunksize)
                if layer_num < len(self.mda_layers) - 1:
                    current_representation = layer[current_representation]

        else:
            ln.info("Using temporary files to speed up training.")

            ln.info("Beginning training on %s layers." %
                    (len(self.mda_layers) + 1))
            self.reduction_layer.train(corpus, chunksize=chunksize)

            # serializing intermediate representation
            MmCorpus.serialize(".msda_intermediate.mm",
                               self.reduction_layer[corpus],
                               progress_cnt=chunksize)

            # load corpus to train next layer
            current_representation = MmCorpus(".msda_intermediate.mm")

            for layer_num, layer in enumerate(self.mda_layers):
                layer.train(current_representation, chunksize=chunksize)
                os.remove(".msda_intermediate.mm")
                os.remove(".msda_intermediate.mm.index")

                if layer_num < len(self.mda_layers) - 1:
                    MmCorpus.serialize(".msda_intermediate.mm",
                                       layer[current_representation],
                                       progress_cnt=chunksize)
                    current_representation = MmCorpus(".msda_intermediate.mm")

        ln.info("mSDA finished training.")
Example 6
def convert_corpus_to_sparse_tfidf(
    metadata_index_outpath: Path,
    vectorized_corpus_outpath: Path,
    path_to_jsonl_index: Path = BIOPAPERS_JSON_PATH,
    path_to_bow: Path = BOW_PATH,
    tfidf_vectorizer: Path = TFIDF_VECTORIZER,
):
    """
    Convert corpora of a specific category into a tfidf sparse matrix.

    It saves:
        1. MM Corpus sparse matrix indexed by id
        2. metadata index as gz pickle file.

    Parameters
    ----------
    metadata_index_outpath: Path
        Path to metadata index (without extension)
    vectorized_corpus_outpath: Path
        Path to corpus matrix, does not require extension
    path_to_jsonl_index: Path
        Path to jsonl_index for that specific category
    path_to_bow: Path
        Path to BOW dictionary
    tfidf_vectorizer:
        Path to TFIDF model for vectorization of BOW corpus
    """
    # Load dictionary
    if path_to_bow.exists():
        bow_dictionary = Dictionary.load(str(path_to_bow))
    else:
        bow_dictionary = create_bow_from_biopapers()
    # Load tfidf model:
    if tfidf_vectorizer.exists():
        tfidf_model = TfidfModel.load(str(tfidf_vectorizer))
    else:
        tfidf_model = create_tfidf_from_papers()
    # Add jsonl suffix:
    metadata_index_outpath = metadata_index_outpath.with_suffix(".jsonl")
    # Load corpus generator:
    tfidf_corpus = BiopapersCorpus(
        bow_dictionary=bow_dictionary,
        path_to_JSONL_index=path_to_jsonl_index,
        tfidf_vectorizer=tfidf_model,
        metadata_index_outpath=metadata_index_outpath,
    )
    # Save corpus and index to file:
    MmCorpus.serialize(str(vectorized_corpus_outpath), tfidf_corpus)
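A common next step after serializing a TF-IDF corpus like this is to build a similarity index over it. A hedged sketch, with hypothetical paths standing in for the arguments passed to convert_corpus_to_sparse_tfidf:

from gensim.corpora import Dictionary
from gensim.corpora.mmcorpus import MmCorpus
from gensim.similarities import SparseMatrixSimilarity

bow_dictionary = Dictionary.load("biopapers_bow.dict")   # hypothetical path_to_bow
tfidf_corpus = MmCorpus("biopapers_tfidf.mm")            # hypothetical vectorized_corpus_outpath

index = SparseMatrixSimilarity(tfidf_corpus, num_features=len(bow_dictionary))
query_vec = next(iter(tfidf_corpus))   # use the first paper itself as a query
sims = index[query_vec]
print(sims.argsort()[::-1][:5])        # indices of the five most similar papers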
Example 7
def main(training_datafile, output_path):
	logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
	logger = logging.getLogger('Archive.gensim')
	filters = [strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short]
	logger.info('Creating Corpus object...')
	corpus = ArchiveCorpus(datafile=training_datafile, preprocess=filters)
	filename = ''.join(training_datafile.split('/')[-1])

	if not os.path.exists(output_path):
		os.makedirs(output_path)

	outfile_path = os.path.join(output_path, filename)
	logger.info('Saving corpus to disk: {}.mm'.format(filename))
	MmCorpus.serialize('{}.mm'.format(outfile_path), corpus, progress_cnt=1000)
	logger.info('Saving dictionary to disk: {}.dict'.format(filename))
	corpus.dictionary.save('{}.dict'.format(outfile_path))
Example 8
def create_corpus(column, shortname, type_n):
    my_corpus = Corpus_Column(fname, column)
    dicti = my_corpus.dictionary
    # shortname1 only has tokens with document frequency < 5 removed; the others use filter_extremes
    once_ids = [tokenid for tokenid, docfreq in dicti.dfs.items() if docfreq < 5]
    dicti.filter_tokens(bad_ids=once_ids)
    #dicti.filter_extremes()
    dicti.compactify()
    dicti.save(path_join(cache_dir, "%s_%s_nltk_filtered_dic.pickle" % (type_n, shortname)))
    MmCorpus.serialize(path_join(cache_dir, "%s_%s_nltk_filtered.corpus.mtx" % (type_n, shortname)), my_corpus, id2word=dicti)
    print(my_corpus.dictionary)
    print("50 most used in %s" % column)
    i = 0
    for k, v in sorted(dicti.dfs.items(), key=operator.itemgetter(1), reverse=True):
        if i < 50:
            print(dicti[k], v)
            i = i + 1

Example 9
    def __iter__(self):
        self.titles = []
        for title, tokens in itertools.islice(iter_wiki(self.dump_file), self.clip_docs):
            self.titles.append(title)
            yield self.dictionary.doc2bow(tokens)
    
    def __len__(self):
        return self.clip_docs

# create a stream of bag-of-words vectors
wiki_corpus = WikiCorpus(fileLocation+'enwiki-latest-pages-articles.xml.bz2', id2word_wiki)
vector = next(iter(wiki_corpus))
print(vector)  # print the first vector in the stream

MmCorpus.serialize(fileLocation+'wikiModels/wiki_bow.mm', wiki_corpus)

mm_corpus = MmCorpus(fileLocation+'wikiModels/wiki_bow.mm')
print(mm_corpus)

clipped_corpus = gensim.utils.ClippedCorpus(mm_corpus, 4000)
lda_model = gensim.models.LdaModel(clipped_corpus, num_topics=10, id2word=id2word_wiki, passes=4)

# store all trained models to disk
lda_model.save(fileLocation+'wikiModels/lda_wiki.model')
#lsi_model.save('./data/lsi_wiki.model')
#tfidf_model.save('./data/tfidf_wiki.model')
id2word_wiki.save(fileLocation+'wikiModels/wiki.dictionary')

loaded_lda_model = gensim.models.LdaModel.load(fileLocation+'wikiModels/lda_wiki.model')
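After training, the clipped-wiki LDA model above can be inspected directly with the objects already in scope. print_topics gives a human-readable view of each topic, and top_topics ranks topics by their coherence on the training corpus:

# Human-readable view of the ten topics.
for topic_id, topic in loaded_lda_model.print_topics(num_topics=10, num_words=6):
    print(topic_id, topic)

# Rank topics by (u_mass) coherence against the clipped training corpus.
top = loaded_lda_model.top_topics(clipped_corpus)
print(top[0])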
Example 10
preprocessor = Preprocessor(use_stemming=False, dictionary=dictionary)

corpusname = "brown"
corpus = [
    preprocessor.preprocess(" ".join(text), return_bow=True)
    for text in brown.sents()
]
preprocessor.dictionary.filter_extremes(15, 0.1, 30000)
corpus = [
    preprocessor.preprocess(" ".join(text),
                            allow_update=False,
                            return_bow=True) for text in brown.sents()
]

ln.debug("saving/loading corpus")
MmCorpus.serialize("test.mm", corpus)
corpus = MmCorpus("test.mm")

dimensions = 2000
params = [{"num_layers": 5, "noise": 0.7}, {"num_layers": 3, "noise": 0.3}][0]

ln.info("training mSDA with %s dimensions. params: %s" % (dimensions, params))
model = mSDAWrapper.train(corpus, dimensions, dictionary, params)

paramstring = "_".join(["%s-%s" % (k, v) for k, v in params.items()])
savestring = "mSDA_%s_%s_" % (corpusname, paramstring)
model.save(savestring)
msda_wrapper = mSDAWrapper(savestring, preprocessor)


def get_synonyms(word):
Example 11
    def save(self, fname):
        self.model.save(fname)


ln.info("preprocessing corpus")
dictionary = Dictionary()

preprocessor = Preprocessor(use_stemming=False, dictionary=dictionary)

corpusname = "brown"
corpus = [preprocessor.preprocess(" ".join(text), return_bow=True) for text in brown.sents()]
preprocessor.dictionary.filter_extremes(15, 0.1, 30000)
corpus = [preprocessor.preprocess(" ".join(text), allow_update=False, return_bow=True) for text in brown.sents()]

ln.debug("saving/loading corpus")
MmCorpus.serialize("test.mm", corpus)
corpus = MmCorpus("test.mm")


dimensions = 2000
params = [{"num_layers": 5, "noise": 0.7},
          {"num_layers": 3, "noise": 0.3}][0]

ln.info("training mSDA with %s dimensions. params: %s" % (dimensions, params))
model = mSDAWrapper.train(corpus, dimensions, dictionary, params)

paramstring = "_".join(["%s-%s" % (k, v) for k, v in params.items()])
savestring = "mSDA_%s_%s_" % (corpusname, paramstring)
model.save(savestring)
msda_wrapper = mSDAWrapper(savestring, preprocessor)
Example 12
    if len(sys.argv) < 3:
        print globals()["__doc__"] % locals()
        sys.exit(1)
    input, output = sys.argv[1:3]
    if len(sys.argv) > 3:
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE

    # build dictionary. only keep 100k most frequent words (out of total ~8.2m unique tokens)
    # takes about 9h on a macbook pro, for 3.5m articles (june 2011 wiki dump)
    wiki = WikiCorpus(input, keep_words=keep_words)
    # save dictionary and bag-of-words (term-document frequency matrix)
    # another ~9h
    wiki.dictionary.save_as_text(output + "_wordids.txt")
    MmCorpus.serialize(output + "_bow.mm", wiki, progress_cnt=10000)
    del wiki

    # initialize corpus reader and word->id mapping
    id2token = Dictionary.load_from_text(output + "_wordids.txt")
    mm = MmCorpus(output + "_bow.mm")

    # build tfidf,
    # ~30min
    from gensim.models import TfidfModel

    tfidf = TfidfModel(mm, id2word=id2token, normalize=True)

    # save tfidf vectors in matrix market format
    # ~2h; result file is 15GB! bzip2'ed down to 4.5GB
    MmCorpus.serialize(output + "_tfidf.mm", tfidf[mm], progress_cnt=10000)
        print "we will lemmatize ('you were'->'be/VB')"
        mname = prefix + '_lemmatized_tfidf'
    else:
        print "you don't have pattern: we will tokenize ('you were'->'you','were')"
        mname = prefix + '_tokenized_tfidf'

    try:
        id2token = Dictionary.load_from_text(mname + '_wordids.txt')
        mm = MmCorpus(mname + '_bow.mm')
        print ">>> Loaded corpus from serialized files"
    except:
        print ">>> Extracting articles..."
        corpus = CDS_Corpus(FOLDER)
        corpus.dictionary.save_as_text(mname + '_wordids.txt')
        print ">>> Saved dictionary as " + mname + "_wordids.txt"
        MmCorpus.serialize(mname + '_bow.mm', corpus, progress_cnt=1000)
        print ">>> Saved MM corpus as " + mname + "_bow.mm"
        id2token = Dictionary.load_from_text(mname + '_wordids.txt')
        mm = MmCorpus(mname + '_bow.mm')
        del corpus

    print ">>> Using TF-IDF"
    tfidf = models.TfidfModel(mm, id2word=id2token, normalize=True)
    corpus_tfidf = tfidf[mm]

    lda = models.ldamodel.LdaModel(corpus=corpus_tfidf, id2word=id2token, 
            num_topics=N_TOPICS, alpha='auto',
            update_every=1, chunksize=800, passes=50)

    f = open(mname + '.ldamodel', 'w')
    cPickle.dump(lda, f)
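The snippet above persists the model by pickling it to a file. Gensim models also carry their own save/load methods, which store the large numpy arrays in separate files and can memory-map them on load. A sketch of that alternative, reusing lda and mname from above:

# Alternative to pickling by hand: gensim's native persistence.
lda.save(mname + '.lda')

# Later: reload it; mmap='r' keeps the big arrays on disk and memory-maps them.
from gensim import models
lda = models.ldamodel.LdaModel.load(mname + '.lda', mmap='r')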
Example 14
File: lsi.py Project: Sasafrass/IR1
    def __init__(self, docs, topic_number=500):
        # Create a dictionary representation of the documents.
        print('training LSI models with topic number = ' + str(topic_number))
        if (not os.path.isfile('./lsi/lsi_dict.dict')):
            print('creating dict')
            dictionary = Dictionary(docs)
            dictionary.save('./lsi/lsi_dict.dict')
        else:
            print('dict already exists')
            dictionary = Dictionary.load("./lsi/lsi_dict.dict")
        self.dictionary = dictionary

        # Create corpora
        if (not os.path.isfile('./lsi/lsi_corpus.mm')):
            # Filter out words that occur less than 20 documents, or more than 50% of the documents.
            print('creating bow corpus')
            dictionary.filter_extremes(no_below=20, no_above=0.5)
            corpus = [dictionary.doc2bow(doc) for doc in docs]
            MmCorpus.serialize("lsi/lsi_corpus.mm", corpus)
        else:
            print('bow corpus already exists')
            corpus = MmCorpus("./lsi/lsi_corpus.mm")

        self.tfidf = models.TfidfModel(corpus)
        if (not os.path.isfile('./lsi/lsi_tf_corpus.mm')):
            print('creating tf corpus')
            tf_corp = self.tfidf[corpus]
            MmCorpus.serialize("lsi/lsi_tf_corpus.mm", tf_corp)
        else:
            print('tf corpus already exists')
            tf_corp = MmCorpus("./lsi/lsi_tf_corpus.mm")

        # Make a index to word dictionary.
        temp = dictionary[0]  # This is only to "load" the dictionary.
        id2word = dictionary.id2token

        #Create the models and vectors
        if (not os.path.isfile('./lsi/lsi_bow_model' + str(topic_number) +
                               '.model')):
            print('creating bow model')
            bow_model = models.LsiModel(corpus=corpus,
                                        num_topics=topic_number,
                                        id2word=id2word)
            bow_model.save('lsi/lsi_bow_model' + str(topic_number) + '.model')
        else:
            print('bow model already exists')
            bow_model = models.LsiModel.load('./lsi/lsi_bow_model' +
                                             str(topic_number) + '.model')
        bow_vector = bow_model[corpus]
        self.bow_model = bow_model

        if (not os.path.isfile('./lsi/lsi_tf_model' + str(topic_number) +
                               '.model')):
            print('creating tfidf model')
            tf_model = models.LsiModel(corpus=tf_corp,
                                       num_topics=topic_number,
                                       id2word=id2word)
            tf_model.save('./lsi/lsi_tf_model' + str(topic_number) + '.model')
        else:
            print('tfidf model already exists')
            tf_model = models.LsiModel.load('./lsi/lsi_tf_model' +
                                            str(topic_number) + '.model')
        tf_vector = tf_model[tf_corp]
        self.tf_model = tf_model

        #Create indices
        if (not os.path.isfile('./lsi/lsi_bow_model' + str(topic_number) +
                               '.index')):
            print('creating bow index')
            bow_index = similarities.MatrixSimilarity(
                bow_vector)  # index corpus in bow LSI space
            bow_index.save('lsi/lsi_bow_model' + str(topic_number) + '.index')
        else:
            print('bow index already exists')
            bow_index = similarities.MatrixSimilarity.load(
                './lsi/lsi_bow_model' + str(topic_number) + '.index')
        self.bow_index = bow_index

        if (not os.path.isfile('./lsi/lsi_tf_model' + str(topic_number) +
                               '.index')):
            print('creating tf index')
            tf_index = similarities.MatrixSimilarity(
                tf_vector)  # index corpus in tf LSI space
            tf_index.save('lsi/lsi_tf_model' + str(topic_number) + '.index')
        else:
            print('tf index already exists')
            tf_index = similarities.MatrixSimilarity.load(
                './lsi/lsi_tf_model' + str(topic_number) + '.index')
        self.tf_index = tf_index
        print('model created!')
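The class above builds bow_index and tf_index but does not show the query path. A minimal usage sketch, as a hypothetical helper method (not part of the original class), assuming an already-tokenized query:

    def query_bow(self, query_tokens, top_k=10):
        # Hypothetical helper: rank indexed documents against a tokenized query in the BoW LSI space.
        bow = self.dictionary.doc2bow(query_tokens)
        lsi_vec = self.bow_model[bow]      # project the query into LSI space
        sims = self.bow_index[lsi_vec]     # cosine similarities against all indexed documents
        return sorted(enumerate(sims), key=lambda x: -x[1])[:top_k]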
Example 15
     files. Only `.txt` files will be taken into account.
 dataset_name : str
     A name for the directory where the processed corpus is to be placed.
 """
 # Parse command line arguments
 parser = argparse.ArgumentParser()
 parser.add_argument("-docs_dir", type=str)
 parser.add_argument("-dataset_name", type=str)
 args = vars(parser.parse_args())
 documents_dir = args["docs_dir"]
 dataset_name = args["dataset_name"]
 document_paths = [
     os.path.join(documents_dir, d) for d in os.listdir(documents_dir)
     if d.endswith(".txt")
 ]
 # Write document index to id mapping
 doc_ids = [
     d.replace(".txt", "") for d in os.listdir(documents_dir)
     if d.endswith(".txt")
 ]
 doc_idxs = {i: doc_ids[i] for i in range(len(doc_ids))}
 os.makedirs(os.path.dirname(doc_idxs_file.format(id=dataset_name)), exist_ok=True)
 with open(doc_idxs_file.format(id=dataset_name), "w") as f:
     json.dump(doc_idxs, f, indent=2)
 # Create tokenizer and tokenize corpus in a single pass
 corpus_file = corpus_file.format(id=dataset_name)
 tokenizer_file = tokenizer_file.format(id=dataset_name)
 os.makedirs(os.path.dirname(corpus_file), exist_ok=True)
 corpus_builder = IterativeCorpusBuilder(document_paths, 2 * 10**6)
 MmCorpus.serialize(fname=corpus_file, corpus=corpus_builder)
 corpus_builder.tokenizer.save(tokenizer_file)
Example 16
        metadata_queue = []

        class MetadataRemovedCorpus:
            def __init__(self, corpus):
                self.corpus = corpus

            def __iter__(self):
                for doc, metadata in self.corpus:
                    metadata_queue.append(metadata)
                    yield doc

        tfidf_corpus = tfidf[MetadataRemovedCorpus(wiki_corpus)]

        class MetadataAddedCorpus:
            def __init__(self, corpus):
                self.corpus = corpus
                self.metadata = True

            def __iter__(self):
                for doc in self.corpus:
                    yield doc, metadata_queue.pop()

        tfidf_metadata_corpus = MetadataAddedCorpus(tfidf_corpus)

        if vector_format == 'tfidf':
            corpus = tfidf_metadata_corpus
        elif vector_format == 'bow':
            corpus = wiki_corpus

        MmCorpus.serialize(mm_fname, corpus, progress_cnt=10000, metadata=True)
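Because this serialization passes metadata=True, gensim writes the per-document metadata to a pickled side file next to the .mm file (conventionally mm_fname + '.metadata.cpickle'). A hedged sketch of reading both back:

from gensim import utils
from gensim.corpora.mmcorpus import MmCorpus

mm = MmCorpus(mm_fname)
# docno -> metadata mapping written by serialize(..., metadata=True); the side-file name
# is assumed to follow gensim's '<fname>.metadata.cpickle' convention.
docno2metadata = utils.unpickle(mm_fname + '.metadata.cpickle')
print(mm[0], docno2metadata[0])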
Example 17
import logging
import sys

from gensim.corpora import WikiCorpus
from gensim.corpora.mmcorpus import MmCorpus

if __name__ == '__main__':

    # Log both to a file and the console
    log_name = 'tut4.log'
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    fh = logging.FileHandler(log_name)
    fh.setLevel(logging.INFO)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s : %(levelname)s : %(message)s')
    ch.setFormatter(formatter)
    fh.setFormatter(formatter)
    logger.addHandler(ch)
    logger.addHandler(fh)

    logger.info("running %s" % ' '.join(sys.argv))

    fname = 'simplewiki-20120313-pages-articles.xml.bz2'
    wiki = WikiCorpus(fname)
    # save dictionary and bag-of-words (term-document frequency matrix)

    output = 'simple_wiki'
    wiki.dictionary.save_as_text(output + '_wordids.txt')
    MmCorpus.serialize(output + '_bow.mm', wiki, progress_cnt=10000)
    del wiki
Example 18
    # optional argv[3] = keep_words
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        #sys.exit(1)
    input, output = sys.argv[1:3]
    if len(sys.argv) > 3:
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE

    # build dictionary. only keep 100k most frequent words (out of total ~900k unique tokens)
    enron = EnronCorpus(input, keep_words=keep_words)

    # save dictionary and bag-of-words (term-document frequency matrix)
    enron.dictionary.save_as_text(output + '_wordids.txt')
    MmCorpus.serialize(output + '_bow.mm', enron, progress_cnt=10000)
    del enron

    # initialize corpus reader and word->id mapping
    id2token = Dictionary.load_from_text(output + '_wordids.txt')
    mm = MmCorpus(output + '_bow.mm')

    # build tfidf
    from gensim.models import TfidfModel
    tfidf = TfidfModel(mm, id2word=id2token, normalize=True)

    # save tfidf vectors in matrix market format
    MmCorpus.serialize(output + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
Example 19
def build_corpus(dictionary):
    MmCorpus.serialize(CORPUS_FILE, BowCorpus(wiki_index.ARTICLES_FILE, dictionary))
    return MmCorpus(CORPUS_FILE)