Example #1
import os
import warnings

import numpy as np

# hasCythonExt, read_from_ldac_file, BagOfWordsData, and
# LoadBagOfWordsDataFromFile_ldac are defined in the surrounding module.


def LoadBagOfWordsDataFromFile_ldac_cython(filepath,
                                           nTokensPerByte=0.2,
                                           **kwargs):
    if not hasCythonExt:
        warnings.warn("Cython extension TextFileReaderX not found."
                      " Falling back to pure python.")
        return BagOfWordsData.LoadFromFile_ldac_python(filepath, **kwargs)
    # Guess a buffer size from the file size: assume about nTokensPerByte
    # unique (word_id, count) entries per byte of input.
    filesize_bytes = os.path.getsize(filepath)
    nUniqueTokens = int(nTokensPerByte * filesize_bytes)
    try:
        dptr = np.zeros(nUniqueTokens, dtype=np.int32)
        wids = np.zeros(nUniqueTokens, dtype=np.int32)
        wcts = np.zeros(nUniqueTokens, dtype=np.float64)
        stop, dstop = read_from_ldac_file(filepath, nUniqueTokens, dptr, wids,
                                          wcts)
        return BagOfWordsData(word_id=wids[:stop],
                              word_count=wcts[:stop],
                              doc_range=dptr[:dstop],
                              **kwargs)
    except IndexError:
        # Buffer guess was too small; retry with double the budget.
        return LoadBagOfWordsDataFromFile_ldac(filepath, nTokensPerByte * 2,
                                               **kwargs)
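
The Cython helper read_from_ldac_file is not shown in these examples. As a rough guide to its contract, here is a minimal pure-Python sketch that fills the same preallocated arrays, assuming the standard LDA-C format (each line is "N id:count id:count ..."); the function name and exact behavior here are assumptions for illustration, not bnpy's actual implementation:

def read_from_ldac_file_sketch(filepath, nUniqueTokens, dptr, wids, wcts):
    # Hypothetical stand-in for the Cython reader. Fills the preallocated
    # arrays in place and returns (stop, dstop): the number of
    # (word_id, count) entries used and the number of doc boundaries.
    stop = 0
    dstop = 1
    dptr[0] = 0
    with open(filepath, 'r') as f:
        for line in f:
            fields = line.split()
            # fields[0] is the number of unique terms in this document;
            # the remaining fields are "term_id:count" pairs.
            for pair in fields[1:]:
                tid, cnt = pair.split(':')
                # numpy raises IndexError on out-of-bounds assignment,
                # which triggers the doubled-budget retry above.
                wids[stop] = int(tid)
                wcts[stop] = float(cnt)
                stop += 1
            dptr[dstop] = stop
            dstop += 1
    return stop, dstop
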
Example #2
    def loadDataForBatch(self, batchID):
        ''' Load the data assigned to a particular batch.

        Returns
        -------
        Dchunk : bnpy.data.DataObj subclass
        '''
        dpath = self.datafileList[batchID]
        # Dispatch on the file extension first, then on the dataset type.
        if dpath.endswith('.ldac'):
            return BagOfWordsData.LoadFromFile_ldac(dpath, **self.DataInfo)
        elif self.dataset_type == 'GroupXData':
            return GroupXData.LoadFromFile(dpath, **self.DataInfo)
        else:
            return XData.LoadFromFile(dpath, **self.DataInfo)
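
This method dispatches on the file extension first and only then on the declared dataset_type. The same branching could be written as a lookup table; a behavior-equivalent sketch under the assumptions shown above (load_batch_file and LOADERS are illustrative names, not part of bnpy):

LOADERS = {
    'GroupXData': GroupXData.LoadFromFile,
}

def load_batch_file(dpath, dataset_type, **DataInfo):
    # .ldac files always use the bag-of-words loader, regardless of type.
    if dpath.endswith('.ldac'):
        return BagOfWordsData.LoadFromFile_ldac(dpath, **DataInfo)
    loader = LOADERS.get(dataset_type, XData.LoadFromFile)
    return loader(dpath, **DataInfo)
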
Example #3
def loadDataForSlice(filepath='', dataset_type='', **kwargs):
    """ Return data object loaded from a specific file.

    Keyword args
    ------------
    workerID
    nWorkers
    """
    if filepath.endswith('.ldac'):
        return BagOfWordsData.LoadFromFile_ldac(filepath, **kwargs)
    elif dataset_type == 'GroupXData':
        return GroupXData.LoadFromFile(filepath, **kwargs)
    else:
        return XData.LoadFromFile(filepath, **kwargs)
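
The workerID and nWorkers keyword args are forwarded through **kwargs to the underlying loader, which is presumed to subselect the slice of documents owned by each worker. A hedged usage sketch (the .ldac path is a placeholder):

# Each parallel worker loads only its own slice of the dataset.
nWorkers = 4
for workerID in range(nWorkers):
    Dslice = loadDataForSlice(
        filepath='/path/to/dataset.ldac',  # placeholder path
        workerID=workerID,
        nWorkers=nWorkers)
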
Example #4
import os

import numpy as np


def LoadBagOfWordsDataFromFile_ldac_cython(filepath, nTokensPerByte=0.2, **kwargs):
    # Guess a buffer size from the file size, then let the Cython reader
    # fill the preallocated arrays in place.
    filesize_bytes = os.path.getsize(filepath)
    nUniqueTokens = int(nTokensPerByte * filesize_bytes)
    try:
        dptr = np.zeros(nUniqueTokens, dtype=np.int32)
        wids = np.zeros(nUniqueTokens, dtype=np.int32)
        wcts = np.zeros(nUniqueTokens, dtype=np.float64)
        stop, dstop = read_from_ldac_file(
            filepath, nUniqueTokens, dptr, wids, wcts)
        return BagOfWordsData(
            word_id=wids[:stop],
            word_count=wcts[:stop],
            doc_range=dptr[:dstop],
            **kwargs)
    except IndexError:
        # Buffer guess was too small; retry with double the budget.
        return LoadBagOfWordsDataFromFile_ldac(
            filepath, nTokensPerByte * 2, **kwargs)

if __name__ == '__main__':
    # fpath = '/ltmp/testNYT.ldac'
    fpath = '/data/liv/textdatasets/nytimes/batches/batch111.ldac'
    vocab_size = 8000

    # Load the same file with both readers and verify they agree.
    fastD = LoadBagOfWordsDataFromFile_ldac_cython(fpath, vocab_size=vocab_size)
    slowD = BagOfWordsData.LoadFromFile_ldac_python(fpath, vocab_size=vocab_size)
    print(fastD.word_id[:10])
    print(slowD.word_id[:10])
    assert np.allclose(fastD.word_id, slowD.word_id)
    assert np.allclose(fastD.word_count, slowD.word_count)
    assert np.allclose(fastD.doc_range, slowD.doc_range)
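
The asserts above verify that the Cython and pure-Python readers produce identical arrays. To quantify why the Cython path exists, a rough one-shot timing sketch could be appended to the same __main__ block (time_loader is an illustrative helper, not part of bnpy; use timeit for careful measurements):

import time

def time_loader(fn, *args, **kwargs):
    # Single-shot wall-clock timer around one loader call.
    t0 = time.perf_counter()
    result = fn(*args, **kwargs)
    return result, time.perf_counter() - t0

fastD, t_fast = time_loader(
    LoadBagOfWordsDataFromFile_ldac_cython, fpath, vocab_size=vocab_size)
slowD, t_slow = time_loader(
    BagOfWordsData.LoadFromFile_ldac_python, fpath, vocab_size=vocab_size)
print('cython: %.3f sec   pure python: %.3f sec' % (t_fast, t_slow))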