class DocumentFactory:
    """create a document according to interface set at constructor. concrete document/tokenizer class is not specified in advance."""
    def __init__(self,
                 encoding='utf8',
                 sourceType='string',
                 contentType='text',
                 lowercase=True,
                 keepTexts=True,
                 keepTokens=True,
                 keepPositions=True):
        """
        sourceType - document created from string passed as parameter/from filename passed as parameter
        contentType - type of text (used for domain knowledge in tokenization etc)
        """
        from tokenizers import TokenizerFactory
        self.tf = TokenizerFactory()
        self.tokenizer = self.tf.createTokenizer(contentType, encoding,
                                                 lowercase)
        self.sourceType = sourceType
        self.contentType = contentType
        self.keepTexts = keepTexts
        self.keepTokens = keepTokens
        self.keepPositions = keepPositions

    def createDocument(self, source, docid=None, dictionary=None):
        """
        source - either text as string or filename (if sourceType=='file')
        docid - document id or filename
        """
        if self.sourceType == 'file':
            if docid is None:
                docid = source
##                docid = os.path.basename(source)
            source = utils_dml.readfile(source)
##        logging.debug("creating document %s" % str(docid))
        result = Document(docid)
        if self.keepTexts:
            result.setText(source)
        # tokenize only when tokens, positions, or a dictionary mapping are actually needed
        if self.keepTokens or self.keepPositions or dictionary is not None:
            if self.keepPositions:
                tokens, pos = self.tokenizer.tokenize(
                    source, returnPositions=self.keepPositions)
            else:
                tokens = self.tokenizer.tokenize(
                    source, returnPositions=self.keepPositions)
            if self.keepTokens:
                result.setTokens(tokens)
            if self.keepPositions:
                result.setTokenPositions(pos)
            if dictionary is not None:
                newwords = {}
                result.setTokenIds(
                    utils_dml.text2vect(tokens, dictionary, newwords))
##                print 'for %s added %i (new length = %i) ' % (docid, len(newwords), len(dictionary))
        return result

    def createDocuments(self, sources, start=None, dictionary=None):
        """
        if sourceType == 'text' then docid will be 'start + sequence index in sources'
        if sourceType == 'file' and start == None, docid will be the filename as returned by os.path.basename()
        """
        result = []
        doNames = self.sourceType == 'file' and start is None
        if start is None:
            start = 0

        cnt = 0
        for source in sources:
            if (cnt + 1) % 1000 == 0:
                logging.debug("progress: doc#%i" % cnt)
            cnt += 1
            if doNames:
                doc = self.createDocument(source, None, dictionary=dictionary)
            else:
                doc = self.createDocument(source, start, dictionary=dictionary)
            result.append(doc)
            start += 1

        return result

    def getTokenizer(self):
        return self.tokenizer
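
# A minimal usage sketch, not part of the original module: it assumes the
# tokenizers module, utils_dml and the Document class used above are importable,
# that text2vect accepts a plain dict as the dictionary, and that the two
# filenames below are stand-ins for real input files.
if __name__ == '__main__':
    factory = DocumentFactory(sourceType='file', contentType='text',
                              keepTexts=False, keepPositions=False)
    dictionary = {}  # token -> id mapping, grown in place via text2vect
    docs = factory.createDocuments(['sample1.txt', 'sample2.txt'],
                                   dictionary=dictionary)
    print("created %i documents; dictionary has %i entries" %
          (len(docs), len(dictionary)))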