def createDocument(self, source, docid=None, dictionary=None):
    """
    source - either text as string or filename (if sourceType=='file')
    docid - document id or filename
    """
    if self.sourceType == 'file':
        if docid is None:
            docid = source
##            docid = os.path.basename(source)
        source = utils_dml.readfile(source)
##    logging.debug("creating document %s" % str(docid))
    result = Document(docid)
    if self.keepTexts:
        result.setText(source)
    if self.keepTokens or self.keepPositions or dictionary is not None:
        if self.keepPositions:
            tokens, pos = self.tokenizer.tokenize(source, returnPositions=self.keepPositions)
        else:
            tokens = self.tokenizer.tokenize(source, returnPositions=self.keepPositions)
        if self.keepTokens:
            result.setTokens(tokens)
        if self.keepPositions:
            result.setTokenPositions(pos)
        if dictionary is not None:
            newwords = {}
            result.setTokenIds(utils_dml.text2vect(tokens, dictionary, newwords))
##            print 'for %s added %i (new length = %i) ' % (docid, len(newwords), len(dictionary))
    return result
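# --- usage sketch (not part of the original source) ----------------------
# Assumes `df` is an already-configured factory instance (as in the demo
# further down) with sourceType == 'string' and keepTokens == True; the
# docid and the shared `dictionary` dict are illustrative values only.
#
#     dictionary = {}
#     doc = df.createDocument("human computer interface", docid='d0',
#                             dictionary=dictionary)
#     print "token ids:", doc.getTokenIds()   # ids assigned via text2vect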
def createDictionary(self):
    """
    Build a token -> id mapping from the tokens of all documents in the
    collection, then re-vectorize each document against the new mapping.

    Requires documents to have been created with keepTokens == True;
    otherwise logs an error and returns an empty dictionary.
    """
    tokens = []
    for doc in self.getDocs():
        doctokens = doc.getTokens()
        if doctokens is None:
            logging.error("DocumentCollection.createDictionary called but keepTokens is False for %s" % doc.getId())
            return {}
        tokens.extend(doctokens)
    self.dictionary = utils_dml.text2dict(tokens)
    for doc in self.getDocs():
        doc.setTokenIds(utils_dml.text2vect(doc.getTokens(), self.dictionary))
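# --- helper semantics sketch (assumption, for illustration only) ---------
# utils_dml.text2dict and text2vect are not shown in this excerpt; the
# behaviour the code above relies on is roughly the following: text2dict
# assigns consecutive integer ids to unique tokens, and text2vect turns a
# token list into the matching id list. Whether text2vect also grows the
# dictionary with unseen tokens (recording them in `newwords`) is a guess
# based on the commented-out print in createDocument.
def _text2dict_sketch(tokens):
    result = {}
    for token in tokens:
        if token not in result:
            result[token] = len(result)  # next free id
    return result

def _text2vect_sketch(tokens, dictionary, newwords=None):
    vect = []
    for token in tokens:
        if token not in dictionary:
            dictionary[token] = len(dictionary)
            if newwords is not None:
                newwords[token] = dictionary[token]
        vect.append(dictionary[token])
    return vect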
coll.removeDocument('m1')
coll.addDocument(df.createDocument(".", "empty_doc"))
coll.addDocument(df.createDocument("minors graph eps trees system computer survey user human time interface response.", "full_doc"))
if not coll.docExists('m1'):
    coll.addDocument(df.createDocument(texts['m1'], 'brekeke'))
coll.createDictionary()
for doc in coll.getDocs():
    print "dc1: %s (%i):" % (doc.getId(), len(doc.getTokenIds())), doc.getTokenIds()
print coll.getDictionary()

mat = coll.getBOWMatrix()
dfm = coll.getDocFreqMap()
stopList = ['a', 'and', 'of', 'the', ':', 'totallyirrelevant']  # fixed stoplist
stopIds = utils_dml.text2vect(stopList, coll.getDictionary())
stopIds.extend(coll.freqRange(dfm, 0, 2))  # extend stopIds with ids that have 0 <= document frequency < 2
print 'stoplist = ', map(utils_dml.reverseMap(coll.getDictionary()).__getitem__, stopIds)
for doc in coll.getDocs():
    print "before filter: %s (%i):" % (doc.getId(), len(doc.getTokenIds())), zip(doc.getTokenIds(), doc.getTokens())
coll.filterIds(stopIds)  # remove unwanted tokens
for doc in coll.getDocs():
    print "after filter, before rebuild: %s (%i):" % (doc.getId(), len(doc.getTokenIds())), zip(doc.getTokenIds(), doc.getTokens())
coll.rebuildDictionary()
for doc in coll.getDocs():
    print "after rebuild: %s (%i):" % (doc.getId(), len(doc.getTokenIds())), zip(doc.getTokenIds(), doc.getTokens())
"full_doc")) if not coll.docExists('m1'): coll.addDocument(df.createDocument(texts['m1'], 'brekeke')) coll.createDictionary() for doc in coll.getDocs(): print "dc1: %s (%i):" % (doc.getId(), len( doc.getTokenIds())), doc.getTokenIds() print coll.getDictionary() mat = coll.getBOWMatrix() dfm = coll.getDocFreqMap() stopList = ['a', 'and', 'of', 'the', ':', 'totallyirrelevant'] # fixed stoplist stopIds = utils_dml.text2vect(stopList, coll.getDictionary()) stopIds.extend(coll.freqRange( dfm, 0, 2)) # extend stopIds with ids that have 0 <= document frequency < 2 print 'stoplist = ', map( utils_dml.reverseMap(coll.getDictionary()).__getitem__, stopIds) for doc in coll.getDocs(): print "before filter: %s (%i):" % (doc.getId(), len( doc.getTokenIds())), zip(doc.getTokenIds(), doc.getTokens()) coll.filterIds(stopIds) # remove unwanted tokens for doc in coll.getDocs(): print "after filter, before rebuild: %s (%i):" % ( doc.getId(), len(doc.getTokenIds())), zip(doc.getTokenIds(),