Esempio n. 1
    def __init__(self, fname, fname_vocab=None):
        """Initialize the corpus from a file and load its vocabulary.

        Parameters
        ----------
        fname : str
            Path to the corpus file.
        fname_vocab : str, optional
            Path to the vocabulary file; each line is read as one word.
            If not specified, these candidates are probed in order and the
            first one that exists is used:

            * ``fname + '.vocab'``
            * ``vocab.txt`` inside ``fname``
            * ``fname`` with its extension replaced by ``.vocab``
            * ``vocab.txt`` in the directory containing ``fname``

        Raises
        ------
        IOError
            If `fname_vocab` is not given and none of the candidates exists.

        """
        IndexedCorpus.__init__(self, fname)
        # NOTE(review): the call target was lost in the original text and is
        # restored as `logger.info` -- confirm the module-level logger name.
        logger.info("loading corpus from %s", fname)

        if fname_vocab is None:
            fname_base, _ = path.splitext(fname)
            fname_dir = path.dirname(fname)
            # path.join avoids probing '/vocab.txt' (filesystem root) when
            # `fname` has no directory component.
            candidates = [
                fname + '.vocab',
                path.join(fname, 'vocab.txt'),
                fname_base + '.vocab',
                path.join(fname_dir, 'vocab.txt'),
            ]
            for fname_vocab in candidates:
                if path.exists(fname_vocab):
                    break
            else:
                # the loop finished without finding an existing candidate
                raise IOError('BleiCorpus: could not find vocabulary file')

        self.fname = fname
        with utils.smart_open(fname_vocab) as fin:
            words = [utils.to_unicode(word).rstrip() for word in fin]
        # word id = zero-based line number in the vocabulary file
        self.id2word = dict(enumerate(words))
        # NOTE(review): presumably computed lazily elsewhere -- confirm.
        self.length = None
Esempio n. 2
    def __init__(self, fname, fname_vocab=None):
        """Initialize the corpus from a UCI-format file and load its vocabulary.

        Parameters
        ----------
        fname : str
            Path to corpus in UCI format.
        fname_vocab : str, optional
            Path to vocab. If None, it is derived with
            ``utils.smart_extension(fname, '.vocab')``.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import UciCorpus
            >>> from gensim.test.utils import datapath
            >>> corpus = UciCorpus(datapath('testcorpus.uci'))
            >>> for document in corpus:
            ...     pass

        """
        IndexedCorpus.__init__(self, fname)
        UciReader.__init__(self, fname)

        if fname_vocab is None:
            fname_vocab = utils.smart_extension(fname, '.vocab')

        self.fname = fname
        # NOTE(review): the opener call was truncated in the original text
        # (only ``, 'rb')`` survived); restored as `utils.open` -- confirm.
        with utils.open(fname_vocab, 'rb') as fin:
            words = [word.strip() for word in fin]
        # word id = zero-based line number in the vocabulary file
        self.id2word = dict(enumerate(words))

        # NOTE(review): the meaning of `transposed` is defined outside this block.
        self.transposed = True
Esempio n. 3
    def __init__(self, fname, fname_vocab=None):
        """Initialize the corpus from a UCI-format file and load its vocabulary.

        Parameters
        ----------
        fname : str
            Path to corpus in UCI format.
        fname_vocab : str, optional
            Path to vocab. If None, it is derived with
            ``utils.smart_extension(fname, '.vocab')``.

        Examples
        --------
        >>> from gensim.corpora import UciCorpus
        >>> from gensim.test.utils import datapath
        >>> corpus = UciCorpus(datapath('testcorpus.uci'))
        >>> for document in corpus:
        ...     pass

        """
        IndexedCorpus.__init__(self, fname)
        UciReader.__init__(self, fname)

        if fname_vocab is None:
            fname_vocab = utils.smart_extension(fname, '.vocab')

        self.fname = fname
        with utils.smart_open(fname_vocab) as fin:
            words = [word.strip() for word in fin]
        # word id = zero-based line number in the vocabulary file
        self.id2word = dict(enumerate(words))

        # NOTE(review): the meaning of `transposed` is defined outside this block.
        self.transposed = True
Esempio n. 4
    def __init__(self, fname, fname_vocab=None):
        """Initialize the corpus from a file and load its vocabulary.

        Parameters
        ----------
        fname : str
            Path to the corpus file.
        fname_vocab : str, optional
            Path to the vocabulary file; each line is read as one word.
            If not specified, these candidates are probed in order and the
            first one that exists is used:

            * ``fname + '.vocab'``
            * ``vocab.txt`` inside ``fname``
            * ``fname`` with its extension replaced by ``.vocab``
            * ``vocab.txt`` in the directory containing ``fname``

        Raises
        ------
        IOError
            If `fname_vocab` is not given and none of the candidates exists.

        """
        IndexedCorpus.__init__(self, fname)
        # NOTE(review): the call target was lost in the original text and is
        # restored as `logger.info` -- confirm the module-level logger name.
        logger.info("loading corpus from %s", fname)

        if fname_vocab is None:
            fname_base, _ = path.splitext(fname)
            fname_dir = path.dirname(fname)
            # path.join avoids probing '/vocab.txt' (filesystem root) when
            # `fname` has no directory component.
            candidates = [
                fname + '.vocab',
                path.join(fname, 'vocab.txt'),
                fname_base + '.vocab',
                path.join(fname_dir, 'vocab.txt'),
            ]
            for fname_vocab in candidates:
                if path.exists(fname_vocab):
                    break
            else:
                # the loop finished without finding an existing candidate
                raise IOError('BleiCorpus: could not find vocabulary file')

        self.fname = fname
        with utils.smart_open(fname_vocab) as fin:
            words = [utils.to_unicode(word).rstrip() for word in fin]
        # word id = zero-based line number in the vocabulary file
        self.id2word = dict(enumerate(words))
Esempio n. 5
 def __init__(self, fname: str):
     """Initialize the corpus from an existing file.

     Only records the path and logs it; nothing is read from `fname` in
     this method itself.

     Parameters
     ----------
     fname : str
         Path to the corpus file.

     """
     IndexedCorpus.__init__(self, fname)
     # NOTE(review): the call target was lost in the original text and is
     # restored as `logger.info` -- confirm the module-level logger name.
     logger.info("loading corpus from %s", fname)
     self.fname = fname
     # NOTE(review): presumably computed lazily elsewhere -- confirm.
     self.length = None
Esempio n. 6
    def __init__(self, fname):
        """Initialize the corpus from a file.

        Only records the path and logs it; nothing is read from `fname` in
        this method itself.

        Parameters
        ----------
        fname : str
            Path to the corpus file.

        """
        IndexedCorpus.__init__(self, fname)
        # NOTE(review): the call target was lost in the original text and is
        # restored as `logger.info` -- confirm the module-level logger name.
        logger.info("loading corpus from %s", fname)

        self.fname = fname  # input file, see class doc for format
        # NOTE(review): presumably computed lazily elsewhere -- confirm.
        self.length = None
Esempio n. 7
    def __init__(self, fname):
        """Initialize the corpus from a file.

        Only records the path and logs it; nothing is read from `fname` in
        this method itself.

        Parameters
        ----------
        fname : str
            Path to the corpus file.

        """
        IndexedCorpus.__init__(self, fname)
        # NOTE(review): the call target was lost in the original text and is
        # restored as `logger.info` -- confirm the module-level logger name.
        logger.info("loading corpus from %s", fname)

        self.fname = fname  # input file, see class doc for format
        # NOTE(review): presumably computed lazily elsewhere -- confirm.
        self.length = None
Esempio n. 8
    def __init__(self, fname, fname_vocab=None):
        """Initialize the corpus reader and load its vocabulary.

        Parameters
        ----------
        fname : str
            Path to the corpus file; passed to both base-class initializers.
        fname_vocab : str, optional
            Path to the vocabulary file; each line is read as one word.
            Defaults to ``fname + '.vocab'``.

        """
        IndexedCorpus.__init__(self, fname)
        UciReader.__init__(self, fname)

        if fname_vocab is None:
            fname_vocab = fname + '.vocab'

        self.fname = fname
        # Read inside a context manager so the file handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(fname_vocab) as fin:
            words = [word.strip() for word in fin]
        # word id = zero-based line number in the vocabulary file
        self.id2word = dict(enumerate(words))

        # NOTE(review): the meaning of `transposed` is defined outside this block.
        self.transposed = True
Esempio n. 9
    def __init__(self, fname, fname_vocab=None):
        """Initialize the corpus reader and load its vocabulary.

        Parameters
        ----------
        fname : str
            Path to the corpus file; passed to both base-class initializers.
        fname_vocab : str, optional
            Path to the vocabulary file; each line is read as one word.
            Defaults to ``fname + '.vocab'``.

        """
        IndexedCorpus.__init__(self, fname)
        UciReader.__init__(self, fname)

        if fname_vocab is None:
            fname_vocab = fname + '.vocab'

        self.fname = fname
        # Read inside a context manager so the file handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(fname_vocab) as fin:
            words = [word.strip() for word in fin]
        # word id = zero-based line number in the vocabulary file
        self.id2word = dict(enumerate(words))

        # NOTE(review): the meaning of `transposed` is defined outside this block.
        self.transposed = True
Esempio n. 10
    def __init__(self, fname, fname_vocab=None):
        """Set up the corpus reader and build the id -> word mapping.

        Parameters
        ----------
        fname : str
            Path to the corpus file; passed to both base-class initializers.
        fname_vocab : str, optional
            Path to the vocabulary file; each line is read as one word.
            When None, ``utils.smart_extension(fname, '.vocab')`` is used.

        """
        IndexedCorpus.__init__(self, fname)
        UciReader.__init__(self, fname)

        vocab_path = fname_vocab
        if vocab_path is None:
            vocab_path = utils.smart_extension(fname, '.vocab')

        self.fname = fname
        with utils.smart_open(vocab_path) as vocab_file:
            vocabulary = [entry.strip() for entry in vocab_file]
        # word id = zero-based line number in the vocabulary file
        self.id2word = dict(enumerate(vocabulary))

        # NOTE(review): the meaning of `transposed` is defined outside this block.
        self.transposed = True
Esempio n. 11
    def __init__(self, fname):
        """Initialize both base classes from `fname`.

        Parameters
        ----------
        fname : {str, file-like object}
            Path to file in MM format or a file-like object that supports `seek()`
            (e.g. a compressed file opened by
            `smart_open <https://github.com/RaRe-Technologies/smart_open>`_).

        """
        # avoid calling super(), too confusing
        IndexedCorpus.__init__(self, fname)
        matutils.MmReader.__init__(self, fname)
Esempio n. 12
    def __init__(self, fname):
        """Initialize both base classes from `fname`.

        Parameters
        ----------
        fname : {str, file-like object}
            Path to file in MM format or a file-like object that supports `seek()`
            (e.g. a compressed file opened by
            `smart_open <https://github.com/RaRe-Technologies/smart_open>`_).

        """
        # avoid calling super(), too confusing
        IndexedCorpus.__init__(self, fname)
        matutils.MmReader.__init__(self, fname)
Esempio n. 13
    def __init__(self, fname, fname_vocab=None):
        """Set up the corpus reader and build the id -> word mapping.

        Parameters
        ----------
        fname : str
            Path to the corpus file; passed to both base-class initializers.
        fname_vocab : str, optional
            Path to the vocabulary file; each line is read as one word.
            When None, ``utils.smart_extension(fname, '.vocab')`` is used.

        """
        IndexedCorpus.__init__(self, fname)
        UciReader.__init__(self, fname)

        vocab_path = fname_vocab
        if vocab_path is None:
            vocab_path = utils.smart_extension(fname, '.vocab')

        self.fname = fname
        with utils.smart_open(vocab_path) as vocab_file:
            vocabulary = [entry.strip() for entry in vocab_file]
        # word id = zero-based line number in the vocabulary file
        self.id2word = dict(enumerate(vocabulary))

        # NOTE(review): the meaning of `transposed` is defined outside this block.
        self.transposed = True
Esempio n. 14
    def __init__(self, fname):
        """Initialize both base classes from `fname`.

        Parameters
        ----------
        fname : {str, file-like object}
            Path to file in MM format or a file-like object that supports `seek()`
            (e.g. :class:`gzip.GzipFile`, :class:`bz2.BZ2File`).

        """
        # avoid calling super(), too confusing
        IndexedCorpus.__init__(self, fname)
        matutils.MmReader.__init__(self, fname)
Esempio n. 15
    def __init__(self, fname):
        """Initialize both base classes from `fname`.

        Parameters
        ----------
        fname : {str, file-like object}
            Path to file in MM format or a file-like object that supports `seek()`
            (e.g. :class:`gzip.GzipFile`, :class:`bz2.BZ2File`).

        """
        # avoid calling super(), too confusing
        IndexedCorpus.__init__(self, fname)
        matutils.MmReader.__init__(self, fname)
Esempio n. 16
    def __init__(self, fname, fnameVocab=None):
        """
        Initialize the corpus from a file.

        `fnameVocab` is the file with vocabulary (each line is read as one
        word); if not specified, it defaults to ``fname + '.vocab'``.
        """
        IndexedCorpus.__init__(self, fname)
        # NOTE(review): the call target was lost in the original text and is
        # restored as `logger.info` -- confirm the module-level logger name.
        logger.info("loading corpus from %s", fname)

        if fnameVocab is None:
            fnameVocab = fname + '.vocab'

        self.fname = fname
        # Read inside a context manager so the file handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(fnameVocab) as fin:
            words = [word.rstrip() for word in fin]
        # word id = zero-based line number in the vocabulary file
        self.id2word = dict(enumerate(words))
        # NOTE(review): presumably computed lazily elsewhere -- confirm.
        self.length = None
Esempio n. 17
    def __init__(self, fname, fnameVocab=None):
        """
        Initialize the corpus from a file.

        `fnameVocab` is the file with vocabulary (each line is read as one
        word); if not specified, it defaults to ``fname + '.vocab'``.
        """
        IndexedCorpus.__init__(self, fname)
        # NOTE(review): the call target was lost in the original text and is
        # restored as `logger.info` -- confirm the module-level logger name.
        logger.info("loading corpus from %s", fname)

        if fnameVocab is None:
            fnameVocab = fname + '.vocab'

        self.fname = fname
        # Read inside a context manager so the file handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(fnameVocab) as fin:
            words = [word.rstrip() for word in fin]
        # word id = zero-based line number in the vocabulary file
        self.id2word = dict(enumerate(words))
        # NOTE(review): presumably computed lazily elsewhere -- confirm.
        self.length = None
Esempio n. 18
    def __init__(self, fname, store_labels=True):
        """
        Initialize the corpus from a file.

        Although vector labels (~SVM target class) are not used in gensim in any way,
        they are parsed and stored in `self.labels` for convenience. Set `store_labels=False`
        to skip storing these labels (e.g. if there are too many vectors to store
        the self.labels array in memory).

        """
        IndexedCorpus.__init__(self, fname)
        # NOTE(review): the call target was lost in the original text and is
        # restored as `logger.info` -- confirm the module-level logger name.
        logger.info("loading corpus from %s", fname)

        self.fname = fname  # input file, see class doc for format
        # NOTE(review): presumably computed lazily elsewhere -- confirm.
        self.length = None
        self.store_labels = store_labels
        # starts empty; nothing is parsed in this method itself
        self.labels = []
Esempio n. 19
    def __init__(self, fname, store_labels=True):
        """
        Initialize the corpus from a file.

        Although vector labels (~SVM target class) are not used in gensim in any way,
        they are parsed and stored in `self.labels` for convenience. Set `store_labels=False`
        to skip storing these labels (e.g. if there are too many vectors to store
        the self.labels array in memory).

        """
        IndexedCorpus.__init__(self, fname)
        # NOTE(review): the call target was lost in the original text and is
        # restored as `logger.info` -- confirm the module-level logger name.
        logger.info("loading corpus from %s", fname)

        self.fname = fname  # input file, see class doc for format
        # NOTE(review): presumably computed lazily elsewhere -- confirm.
        self.length = None
        self.store_labels = store_labels
        # starts empty; nothing is parsed in this method itself
        self.labels = []
Esempio n. 20
    def __init__(self, fname, id2word=None, line2words=splitOnSpace):
        """
        Initialize the corpus from a file.

        `id2word` and `line2words` are optional parameters.

        If provided, `id2word` is a dictionary mapping between wordIds (integers)
        and words (strings). If not provided, the mapping is constructed from
        the documents.

        `line2words` is a function which converts lines into tokens. Defaults to
        simple splitting on spaces.
        """
        IndexedCorpus.__init__(self, fname)
        # NOTE(review): the call target was lost in the original text and is
        # restored as `logger.info` -- confirm the module-level logger name.
        logger.info("loading corpus from %s", fname)

        self.fname = fname  # input file, see class doc for format
        self.line2words = line2words  # how to translate lines into words (simply split on space by default)
        # the first line in input data is the number of documents (integer). throws exception on bad input.
        # NOTE(review): the argument of int() was lost in the original text;
        # restored as a read of the first line of `fname` -- confirm.
        with open(fname) as fin:
            self.numDocs = int(fin.readline())

        if not id2word:
            # build a list of all word types in the corpus (distinct words)
            logger.info("extracting vocabulary from the corpus")
            allTerms = set()
            self.useWordIds = False  # return documents as (word, wordCount) 2-tuples
            for doc in self:
                allTerms.update(word for word, _ in doc)
            # sort the list of all words; rank in that list = word's integer id
            self.id2word = dict(enumerate(sorted(allTerms)))
        else:
            logger.info("using provided word mapping (%i ids)", len(id2word))
            self.id2word = id2word
        # inverse mapping word -> id; .items() works on Python 2 and 3 alike
        self.word2id = dict((v, k) for k, v in self.id2word.items())
        self.numTerms = len(self.word2id)
        self.useWordIds = True  # return documents as (wordIndex, wordCount) 2-tuples
        logger.info(
            "loaded corpus with %i documents and %i terms from %s",
            self.numDocs, self.numTerms, fname
        )
    def __init__(self, fname, store_labels=True):
        """Initialize the corpus from a file.

        Parameters
        ----------
        fname: str
            Path to corpus.
        store_labels : bool, optional
            Whether to store labels (~SVM target class). They currently have no application but stored
            in `self.labels` for convenience by default.

        """
        IndexedCorpus.__init__(self, fname)
        # NOTE(review): the call target was lost in the original text and is
        # restored as `logger.info` -- confirm the module-level logger name.
        logger.info("loading corpus from %s", fname)

        self.fname = fname  # input file, see class doc for format
        # NOTE(review): presumably computed lazily elsewhere -- confirm.
        self.length = None
        self.store_labels = store_labels
        # starts empty; nothing is parsed in this method itself
        self.labels = []
Esempio n. 22
    def __init__(self, fname, fname_vocab=None):
        """Initialize the corpus from a file and load its vocabulary.

        Parameters
        ----------
        fname : str
            Path to corpus.
        fname_vocab : str, optional
            Vocabulary file. If `fname_vocab` is None, searching one of variants:

            * `fname`.vocab
            * `fname`/vocab.txt
            * `fname_without_ext`.vocab
            * `fname_folder`/vocab.txt

        Raises
        ------
        IOError
            If vocabulary file doesn't exist.

        """
        IndexedCorpus.__init__(self, fname)
        # NOTE(review): the call target was lost in the original text and is
        # restored as `logger.info` -- confirm the module-level logger name.
        logger.info("loading corpus from %s", fname)

        if fname_vocab is None:
            fname_base, _ = path.splitext(fname)
            fname_dir = path.dirname(fname)
            candidates = [
                utils.smart_extension(fname, '.vocab'),
                utils.smart_extension(fname, '/vocab.txt'),
                utils.smart_extension(fname_base, '.vocab'),
                utils.smart_extension(fname_dir, '/vocab.txt'),
            ]
            # candidates are probed in order; the first existing one wins
            for fname_vocab in candidates:
                if path.exists(fname_vocab):
                    break
            else:
                # the loop finished without finding an existing candidate
                raise IOError('BleiCorpus: could not find vocabulary file')

        self.fname = fname
        with utils.smart_open(fname_vocab) as fin:
            words = [utils.to_unicode(word).rstrip() for word in fin]
        # word id = zero-based line number in the vocabulary file
        self.id2word = dict(enumerate(words))
Esempio n. 23
    def __init__(self, fname, fname_vocab=None):
        """Initialize the corpus from a file and load its vocabulary.

        Parameters
        ----------
        fname : str
            Path to corpus.
        fname_vocab : str, optional
            Vocabulary file. If `fname_vocab` is None, searching one of variants:

            * `fname`.vocab
            * `fname`/vocab.txt
            * `fname_without_ext`.vocab
            * `fname_folder`/vocab.txt

        Raises
        ------
        IOError
            If vocabulary file doesn't exist.

        """
        IndexedCorpus.__init__(self, fname)
        # NOTE(review): the call target was lost in the original text and is
        # restored as `logger.info` -- confirm the module-level logger name.
        logger.info("loading corpus from %s", fname)

        if fname_vocab is None:
            fname_base, _ = path.splitext(fname)
            fname_dir = path.dirname(fname)
            candidates = [
                utils.smart_extension(fname, '.vocab'),
                utils.smart_extension(fname, '/vocab.txt'),
                utils.smart_extension(fname_base, '.vocab'),
                utils.smart_extension(fname_dir, '/vocab.txt'),
            ]
            # candidates are probed in order; the first existing one wins
            for fname_vocab in candidates:
                if path.exists(fname_vocab):
                    break
            else:
                # the loop finished without finding an existing candidate
                raise IOError('BleiCorpus: could not find vocabulary file')

        self.fname = fname
        # NOTE(review): the opener call was truncated in the original text
        # (only ``, 'rb')`` survived); restored as `utils.open` -- confirm.
        with utils.open(fname_vocab, 'rb') as fin:
            words = [utils.to_unicode(word).rstrip() for word in fin]
        # word id = zero-based line number in the vocabulary file
        self.id2word = dict(enumerate(words))
Esempio n. 24
    def __init__(self, fname, id2word=None, line2words=split_on_space):
        """Initialize the corpus from a file and set up its vocabulary.

        Parameters
        ----------
        fname : str
            Path to file in GibbsLda++ format.
        id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
            Mapping between word_ids (integers) and words (strings).
            If not provided, the mapping is constructed directly from `fname`.
        line2words : callable, optional
            Function which converts lines(str) into tokens(list of str),
            using :func:`~gensim.corpora.lowcorpus.split_on_space` as default.

        """
        IndexedCorpus.__init__(self, fname)
        # NOTE(review): the call target was lost in the original text and is
        # restored as `logger.info` -- confirm the module-level logger name.
        logger.info("loading corpus from %s", fname)

        self.fname = fname  # input file, see class doc for format
        self.line2words = line2words  # how to translate lines into words (simply split on space by default)
        self.num_docs = self._calculate_num_docs()

        if not id2word:
            # build a list of all word types in the corpus (distinct words)
            logger.info("extracting vocabulary from the corpus")
            all_terms = set()
            self.use_wordids = False  # return documents as (word, wordCount) 2-tuples
            for doc in self:
                all_terms.update(word for word, _ in doc)
            # sort the list of all words; rank in that list = word's integer id
            self.id2word = dict(enumerate(sorted(all_terms)))
        else:
            logger.info("using provided word mapping (%i ids)", len(id2word))
            self.id2word = id2word
        # NOTE(review): `self.word2id` is not assigned in this method; presumably
        # it is kept in sync with `id2word` elsewhere in the class -- confirm.
        self.num_terms = len(self.word2id)
        self.use_wordids = True  # return documents as (wordIndex, wordCount) 2-tuples
        logger.info(
            "loaded corpus with %i documents and %i terms from %s",
            self.num_docs, self.num_terms, fname
        )
Esempio n. 25
    def __init__(self, fname, id2word=None, line2words=split_on_space):
        """Initialize the corpus from a file and set up its vocabulary.

        Parameters
        ----------
        fname : str
            Path to file in GibbsLda++ format.
        id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
            Mapping between word_ids (integers) and words (strings).
            If not provided, the mapping is constructed directly from `fname`.
        line2words : callable, optional
            Function which converts lines(str) into tokens(list of str),
            using :func:`~gensim.corpora.lowcorpus.split_on_space` as default.

        """
        IndexedCorpus.__init__(self, fname)
        # NOTE(review): the call target was lost in the original text and is
        # restored as `logger.info` -- confirm the module-level logger name.
        logger.info("loading corpus from %s", fname)

        self.fname = fname  # input file, see class doc for format
        self.line2words = line2words  # how to translate lines into words (simply split on space by default)
        self.num_docs = self._calculate_num_docs()

        if not id2word:
            # build a list of all word types in the corpus (distinct words)
            logger.info("extracting vocabulary from the corpus")
            all_terms = set()
            self.use_wordids = False  # return documents as (word, wordCount) 2-tuples
            for doc in self:
                all_terms.update(word for word, _ in doc)
            # sort the list of all words; rank in that list = word's integer id
            self.id2word = dict(enumerate(sorted(all_terms)))
        else:
            logger.info("using provided word mapping (%i ids)", len(id2word))
            self.id2word = id2word
        # NOTE(review): `self.word2id` is not assigned in this method; presumably
        # it is kept in sync with `id2word` elsewhere in the class -- confirm.
        self.num_terms = len(self.word2id)
        self.use_wordids = True  # return documents as (wordIndex, wordCount) 2-tuples
        logger.info(
            "loaded corpus with %i documents and %i terms from %s",
            self.num_docs, self.num_terms, fname
        )
Esempio n. 26
    def __init__(self, fname, id2word=None, line2words=splitOnSpace):
        """
        Initialize the corpus from a file.

        `id2word` and `line2words` are optional parameters.

        If provided, `id2word` is a dictionary mapping between wordIds (integers)
        and words (strings). If not provided, the mapping is constructed from
        the documents.

        `line2words` is a function which converts lines into tokens. Defaults to
        simple splitting on spaces.
        """
        IndexedCorpus.__init__(self, fname)
        # NOTE(review): the call target was lost in the original text and is
        # restored as `logger.info` -- confirm the module-level logger name.
        logger.info("loading corpus from %s", fname)

        self.fname = fname  # input file, see class doc for format
        self.line2words = line2words  # how to translate lines into words (simply split on space by default)
        # the first line in input data is the number of documents (integer). throws exception on bad input.
        # NOTE(review): the argument of int() was lost in the original text;
        # restored as a read of the first line of `fname` -- confirm.
        with open(fname) as fin:
            self.numDocs = int(fin.readline())

        if not id2word:
            # build a list of all word types in the corpus (distinct words)
            logger.info("extracting vocabulary from the corpus")
            allTerms = set()
            self.useWordIds = False  # return documents as (word, wordCount) 2-tuples
            for doc in self:
                allTerms.update(word for word, _ in doc)
            # sort the list of all words; rank in that list = word's integer id
            self.id2word = dict(enumerate(sorted(allTerms)))
        else:
            logger.info("using provided word mapping (%i ids)", len(id2word))
            self.id2word = id2word
        # inverse mapping word -> id; .items() works on Python 2 and 3 alike
        self.word2id = dict((v, k) for k, v in self.id2word.items())
        self.numTerms = len(self.word2id)
        self.useWordIds = True  # return documents as (wordIndex, wordCount) 2-tuples
        logger.info(
            "loaded corpus with %i documents and %i terms from %s",
            self.numDocs, self.numTerms, fname
        )
Esempio n. 27
    def __init__(self, fname, id2word=None, line2words=split_on_space):
        """
        Initialize the corpus from a file.

        `id2word` and `line2words` are optional parameters.
        If provided, `id2word` is a dictionary mapping between word_ids (integers)
        and words (strings). If not provided, the mapping is constructed from
        the documents.

        `line2words` is a function which converts lines into tokens. Defaults to
        simple splitting on spaces.
        """
        IndexedCorpus.__init__(self, fname)
        # NOTE(review): the call target was lost in the original text and is
        # restored as `logger.info` -- confirm the module-level logger name.
        logger.info("loading corpus from %s", fname)

        self.fname = fname  # input file, see class doc for format
        self.line2words = line2words  # how to translate lines into words (simply split on space by default)
        self.num_docs = self._calculate_num_docs()

        if not id2word:
            # build a list of all word types in the corpus (distinct words)
            logger.info("extracting vocabulary from the corpus")
            all_terms = set()
            self.use_wordids = False  # return documents as (word, wordCount) 2-tuples
            for doc in self:
                all_terms.update(word for word, _ in doc)
            # sort the list of all words; rank in that list = word's integer id
            # (enumerate replaces the truncated izip(...) call and needs no import)
            self.id2word = dict(enumerate(sorted(all_terms)))
        else:
            logger.info("using provided word mapping (%i ids)", len(id2word))
            self.id2word = id2word
        # NOTE(review): `self.word2id` is not assigned in this method; presumably
        # it is kept in sync with `id2word` elsewhere in the class -- confirm.
        self.num_terms = len(self.word2id)
        self.use_wordids = True  # return documents as (wordIndex, wordCount) 2-tuples
        logger.info(
            "loaded corpus with %i documents and %i terms from %s",
            self.num_docs, self.num_terms, fname
        )
Esempio n. 28
    def __init__(self, fname, id2word=None, line2words=split_on_space):
        """
        Initialize the corpus from a file.

        `id2word` and `line2words` are optional parameters.
        If provided, `id2word` is a dictionary mapping between word_ids (integers)
        and words (strings). If not provided, the mapping is constructed from
        the documents.

        `line2words` is a function which converts lines into tokens. Defaults to
        simple splitting on spaces.
        """
        IndexedCorpus.__init__(self, fname)
        # NOTE(review): the call target was lost in the original text and is
        # restored as `logger.info` -- confirm the module-level logger name.
        logger.info("loading corpus from %s", fname)

        self.fname = fname  # input file, see class doc for format
        self.line2words = line2words  # how to translate lines into words (simply split on space by default)
        self.num_docs = self._calculate_num_docs()

        if not id2word:
            # build a list of all word types in the corpus (distinct words)
            logger.info("extracting vocabulary from the corpus")
            all_terms = set()
            self.use_wordids = False  # return documents as (word, wordCount) 2-tuples
            for doc in self:
                all_terms.update(word for word, _ in doc)
            # sort the list of all words; rank in that list = word's integer id
            # (enumerate replaces izip(xrange(...)), which exists only on Python 2)
            self.id2word = dict(enumerate(sorted(all_terms)))
        else:
            logger.info("using provided word mapping (%i ids)", len(id2word))
            self.id2word = id2word
        # NOTE(review): `self.word2id` is not assigned in this method; presumably
        # it is kept in sync with `id2word` elsewhere in the class -- confirm.
        self.num_terms = len(self.word2id)
        self.use_wordids = True  # return documents as (wordIndex, wordCount) 2-tuples
        logger.info(
            "loaded corpus with %i documents and %i terms from %s",
            self.num_docs, self.num_terms, fname
        )
Esempio n. 29
 def __init__(self, fname):
     """Run both base-class initializers with the same `fname`."""
     # The bases are initialised explicitly, in this order, rather than
     # through super() (the original author found super() too confusing here).
     for base in (IndexedCorpus, matutils.MmReader):
         base.__init__(self, fname)
Esempio n. 30
 def __init__(self, fname):
     """Run both base-class initializers with the same `fname`."""
     # The bases are initialised explicitly, in this order, rather than
     # through super() (the original author found super() too confusing here).
     for base in (IndexedCorpus, matutils.MmReader):
         base.__init__(self, fname)