import logging

import utils_dml

# NOTE: the `Document` class instantiated below is assumed to be defined
# elsewhere in this module/package; only its setters are used here.


class DocumentFactory:
    """Create documents according to the interface set in the constructor.

    The concrete document/tokenizer class is not specified in advance; the
    tokenizer is chosen by TokenizerFactory based on contentType.
    """

    def __init__(self, encoding='utf8', sourceType='string', contentType='text',
                 lowercase=True, keepTexts=True, keepTokens=True, keepPositions=True):
        """
        sourceType - 'string': documents are created from strings passed in
                     directly; 'file': each source is a filename whose contents
                     are read
        contentType - type of text (used for domain knowledge in tokenization etc.)
        """
        from tokenizers import TokenizerFactory
        self.tf = TokenizerFactory()
        self.tokenizer = self.tf.createTokenizer(contentType, encoding, lowercase)
        self.sourceType = sourceType
        self.contentType = contentType
        self.keepTexts = keepTexts
        self.keepTokens = keepTokens
        self.keepPositions = keepPositions

    def createDocument(self, source, docid=None, dictionary=None):
        """
        source - either text as a string, or a filename (if sourceType == 'file')
        docid - document id; defaults to the filename when sourceType == 'file'
        """
        if self.sourceType == 'file':
            if docid is None:
                docid = source
##                docid = os.path.basename(source)
            source = utils_dml.readfile(source)
##        logging.debug("creating document %s" % str(docid))
        result = Document(docid)
        if self.keepTexts:
            result.setText(source)
        if self.keepTokens or self.keepPositions or dictionary is not None:
            if self.keepPositions:
                tokens, pos = self.tokenizer.tokenize(source, returnPositions=self.keepPositions)
            else:
                tokens = self.tokenizer.tokenize(source, returnPositions=self.keepPositions)
            if self.keepTokens:
                result.setTokens(tokens)
            if self.keepPositions:
                result.setTokenPositions(pos)
            if dictionary is not None:
                newwords = {}
                result.setTokenIds(utils_dml.text2vect(tokens, dictionary, newwords))
##                print 'for %s added %i (new length = %i) ' % (docid, len(newwords), len(dictionary))
        return result

    def createDocuments(self, sources, start=None, dictionary=None):
        """
        if sourceType == 'string', docid will be start + sequence index in sources;
        if sourceType == 'file' and start is None, docid will be the source filename
        """
        result = []
        doNames = self.sourceType == 'file' and start is None
        if start is None:
            start = 0
        cnt = 0
        for source in sources:
            if (cnt + 1) % 1000 == 0:
                logging.debug("progress: doc#%i" % cnt)
            cnt += 1
            if doNames:
                doc = self.createDocument(source, None, dictionary=dictionary)
            else:
                doc = self.createDocument(source, start, dictionary=dictionary)
            result.append(doc)
            start += 1
        return result

    def getTokenizer(self):
        return self.tokenizer
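# A minimal usage sketch (assumption: the project-local `tokenizers`,
# `utils_dml` and `Document` dependencies are importable; the sample
# strings and the empty starting dictionary are illustrative only).
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    factory = DocumentFactory(sourceType='string', contentType='text')
    dictionary = {}  # token -> id mapping; text2vect grows it with unseen tokens
    docs = factory.createDocuments(['first sample text', 'second sample text'],
                                   dictionary=dictionary)
    # docids are 0, 1, ... because sourceType == 'string' and start was left at None
    logging.info("created %i documents; dictionary now holds %i distinct tokens"
                 % (len(docs), len(dictionary)))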