def createDocument(self, source, docid=None, dictionary=None):
    """
    source - either text as string or filename (if sourceType=='file')
    docid - document id or filename
    """
    if self.sourceType == 'file':
        if docid is None:
            docid = source
##            docid = os.path.basename(source)
        source = utils_dml.readfile(source)  # replace the filename with the file's contents
##    logging.debug("creating document %s" % str(docid))
    result = Document(docid)
    if self.keepTexts:
        result.setText(source)
    if self.keepTokens or self.keepPositions or dictionary is not None:
        if self.keepPositions:
            tokens, pos = self.tokenizer.tokenize(source, returnPositions=True)
        else:
            tokens = self.tokenizer.tokenize(source, returnPositions=False)
        if self.keepTokens:
            result.setTokens(tokens)
        if self.keepPositions:
            result.setTokenPositions(pos)
        if dictionary is not None:
            newwords = {}  # filled in by text2vect with tokens not yet in the dictionary
            result.setTokenIds(utils_dml.text2vect(tokens, dictionary, newwords))
##            print 'for %s added %i (new length = %i) ' % (docid, len(newwords), len(dictionary))
    return result
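# A minimal usage sketch, not part of the original listing: `DocumentFactory` is an
# assumed name for the class that owns createDocument, constructed with keyword
# arguments mirroring the attributes the method reads (sourceType, keepTexts,
# keepTokens, keepPositions, tokenizer). Any sourceType other than 'file' makes
# `source` be treated as raw text rather than a filename.
#
#   df = DocumentFactory(sourceType='text', keepTexts=True, keepTokens=True,
#                        keepPositions=False)
#   doc = df.createDocument("human computer interface", docid='d1')
#   print doc.getTokens()  # e.g. ['human', 'computer', 'interface'], given a whitespace tokenizer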
def createDictionary(self):
    tokens = []
    for doc in self.getDocs():
        doctokens = doc.getTokens()
        if doctokens is None:
            logging.error("DocumentCollection.createDictionary called but keepTokens is False for %s" % doc.getId())
            return {}
        tokens.extend(doctokens)
    # build one token -> id mapping over the whole collection, then re-encode each document as ids
    self.dictionary = utils_dml.text2dict(tokens)
    for doc in self.getDocs():
        doc.setTokenIds(utils_dml.text2vect(doc.getTokens(), self.dictionary))
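# A sketch of the two utils_dml helpers used above, inferred from their call sites
# (an assumption, not the actual utils_dml code): text2dict assigns consecutive
# integer ids to distinct tokens, and text2vect encodes a token list as ids,
# growing the dictionary in place and recording previously unseen tokens in `newwords`.

def text2dict(tokens):
    result = {}
    for token in tokens:
        if token not in result:
            result[token] = len(result)  # next free integer id
    return result

def text2vect(tokens, dictionary, newwords=None):
    ids = []
    for token in tokens:
        if token not in dictionary:
            dictionary[token] = len(dictionary)  # extend the dictionary in place
            if newwords is not None:
                newwords[token] = dictionary[token]
        ids.append(dictionary[token])
    return ids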
    coll.removeDocument('m1')
    coll.addDocument(df.createDocument(".", "empty_doc"))
    coll.addDocument(df.createDocument("minors graph eps trees system computer survey user human time interface response.", "full_doc"))
    if not coll.docExists('m1'):
        coll.addDocument(df.createDocument(texts['m1'], 'brekeke'))
        
    coll.createDictionary()
    
    for doc in coll.getDocs():
        print "dc1: %s (%i):" % (doc.getId(), len(doc.getTokenIds())), doc.getTokenIds()
    print coll.getDictionary()
    
    mat = coll.getBOWMatrix()
    dfm = coll.getDocFreqMap()
    stopList = ['a','and','of','the',':', 'totallyirrelevant'] # fixed stoplist
    stopIds = utils_dml.text2vect(stopList, coll.getDictionary())
    stopIds.extend(coll.freqRange(dfm, 0, 2)) # extend stopIds with ids that have 0 <= document frequency < 2
    print 'stoplist = ', map(utils_dml.reverseMap(coll.getDictionary()).__getitem__, stopIds)

    for doc in coll.getDocs():
        print "before filter: %s (%i):" % (doc.getId(), len(doc.getTokenIds())), zip(doc.getTokenIds(), doc.getTokens())
        
    coll.filterIds(stopIds) # remove unwanted tokens
    
    for doc in coll.getDocs():
        print "after filter, before rebuild: %s (%i):" % (doc.getId(), len(doc.getTokenIds())), zip(doc.getTokenIds(), doc.getTokens())
        
    coll.rebuildDictionary()
    for doc in coll.getDocs():
        print "after rebuild: %s (%i):" % (doc.getId(), len(doc.getTokenIds())), zip(doc.getTokenIds(), doc.getTokens())
        