def __init__(self, conf=None):
    self.suffix = 'gram'
    self.target = 'tweets'
    self.limit = -1
    self.set_conf(conf)

    self.mparser = MecabParser()
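Each `__init__` on this page calls `set_conf`, which belongs to the `Plugin` base class and is not shown here. A minimal sketch, assuming it simply exposes every configuration key as an instance attribute (the `execute` methods below read attributes such as `self.db`, `self.skip`, and `self.mr_delimiter` that are never assigned in `__init__`):

class Plugin:
    def set_conf(self, conf):
        # Hypothetical implementation: expose each configuration key as an
        # instance attribute so execute() can read self.db, self.skip, etc.
        for key, value in (conf or {}).items():
            setattr(self, key, value)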
Example #2
def __init__(self, conf=None, delimiter=","):
    self.delimiter = delimiter
    self.suffix = 'terms'
    self.debug = True
    self.skip = False

    self.set_conf(conf)

    self.splitter = MecabParser()
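`MecabParser` also comes from the surrounding project. Judging from how both classes consume it, `split(text)` returns a list of sentences, each a list of surface tokens. A minimal sketch of such a wrapper over the mecab-python3 binding (the sentence split on '。' is an assumption):

import MeCab

class MecabParser:
    """Hypothetical stand-in: split text into sentences of surface tokens."""

    def __init__(self):
        # '-Owakati' makes MeCab emit space-separated surface forms.
        self.tagger = MeCab.Tagger('-Owakati')

    def split(self, text):
        sentences = []
        for sentence in text.split('。'):  # naive sentence boundary (assumption)
            tokens = self.tagger.parse(sentence).split()
            if tokens:
                sentences.append(tokens)
        return sentences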
Example #3
class TermExtracter(Plugin):

    def __init__(self, conf=None, delimiter=","):
        self.delimiter = delimiter
        self.suffix = 'terms'
        self.debug = True
        self.skip = False

        self.set_conf(conf)

        self.splitter = MecabParser()

    def execute(self, data):
        # target = data['in']
        target = self.target

        out = '%s.%s' % (target, self.suffix)
        data['in'] = out

        if self.skip:
            return data

        with open(target, 'r') as fin, open(out, 'w') as fout:
            for line in fin:
                path = line.strip()
                print(path)

                try:
                    with open(path, 'r') as fdoc:
                        body = fdoc.read()
                except IOError:
                    # Skip listed paths that cannot be opened.
                    continue

                if self.debug:
                    print(body)

                # Flatten the sentences of tokens into one space-separated string.
                ss = self.splitter.split(body)
                terms = ' '.join(t for s in ss for t in s)
                if terms:
                    fout.write('%s%s%s\n' % (path, self.delimiter, terms))

        return data
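A hedged usage sketch for `TermExtracter`: the conf keys (`target`, `skip`, `debug`) and the `data` dict protocol are inferred from the code above, and `paths.txt` is a hypothetical file listing one document path per line:

conf = {
    'target': 'paths.txt',  # hypothetical listing, one document path per line
    'skip': False,
    'debug': False,
}
plugin = TermExtracter(conf=conf, delimiter=',')
data = plugin.execute({})
print(data['in'])  # -> 'paths.txt.terms': one "path,term term ..." row per readable document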
class TMNgramMapperLoader(Plugin):

    def __init__(self, conf=None):
        self.suffix = 'gram'
        self.target = 'tweets'
        self.limit = -1
        self.set_conf(conf)
        
        self.mparser = MecabParser()

    def execute(self, data):
        target = self.target

        # `con` is assumed to be a module-level MongoDB connection
        # (e.g. a pymongo MongoClient) created elsewhere in the project.
        if self.db:
            db = con[self.db]
            collection = db[self.collection]

        if self.ngram:
            self.suffix = str(self.ngram) + 'gram.mapped'

        out = '%s.%s' % (target, self.suffix)
        data['in'] = out

        n = int(self.ngram)

        limit = -1
        if self.limit:
            limit = self.limit

        if self.skip:
            return data

        count = 0
        with open(out, 'w') as fout:
            for tw in collection.find():
                tid = tw['id']
                if tw.get('_id'):
                    del tw['_id']

                text = tw.get('text')
                text = text.replace(self.mr_delimiter, ' ')

                # Morphological analysis.
                ss = self.mparser.split(text)

                # Loop over sentences.
                for s in ss:
                    # Loop over morphemes: emit every gram of length 1..n.
                    l = len(s)
                    for i in range(l):
                        # j is one past the gram's end; the bound l + 1 lets
                        # grams end on the final morpheme of the sentence.
                        for j in range(i + 1, l + 1):
                            if j - i == n + 1:
                                break

                            ngram = ' '.join(s[i:j])
                            # print(' ', ngram)
                            fout.write('%s%s1\n' % (ngram, self.mr_delimiter))

                            if count > 0 and count % 1000 == 0:
                                print("#", count)

                            count += 1
                            if limit > 0 and limit < count:
                                return data

        return data
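A hedged driver sketch for `TMNgramMapperLoader`, assuming the module-level connection `con` that `execute` reads is a pymongo `MongoClient`, with conf keys inferred from the attribute accesses above (`db`, `collection`, `ngram`, `limit`, `skip`, and `mr_delimiter` all arrive via `set_conf`):

from pymongo import MongoClient

con = MongoClient('localhost', 27017)  # module-level connection the plugin expects

conf = {
    'db': 'twitter',         # hypothetical database name
    'collection': 'tweets',  # hypothetical collection name
    'ngram': 2,
    'limit': -1,             # -1 means no limit
    'skip': False,
    'mr_delimiter': '\t',
}
loader = TMNgramMapperLoader(conf=conf)
data = loader.execute({})
print(data['in'])  # -> 'tweets.2gram.mapped'

With `ngram` set to 2, a parsed sentence ['I', 'am', 'here'] yields the lines I, I am, am, am here, here, each followed by the delimiter and a count of 1; these "<gram><delimiter>1" pairs read like word-count mapper output for a map-reduce step, which matches the `mr_delimiter` name.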