class IndexedFeature(object):
    """Turn raw text into a sparse, index-sorted list of word frequencies.

    Words are mapped to integer ids through ``self.indexer``; the usual
    entry point is ``text2feature``.
    """

    def __init__(self):
        self.indexer = Indexer()

    def freeze(self):
        """Freeze the underlying indexer (delegates to ``indexer.freeze()``)."""
        self.indexer.freeze()

    def vectorize(self, text):
        """Return ``[(word, relative_frequency), ...]`` for ``text``.

        Lines containing an explicit "I am a man/woman" style statement are
        dropped before counting (presumably so the label is not leaked by the
        text itself -- confirm with the data owner).  The remaining text is
        tokenized on ``\\w+``, passed through ``removeStopList`` and each
        word count is divided by the number of remaining tokens.

        The result is a concrete list in ``Counter.most_common()`` order
        (descending count); text with no remaining tokens yields ``[]``.
        """
        # Compile once per call instead of re-resolving the pattern per line.
        self_description = re.compile(
            r"\b(I am|I'm|Im) a (man|woman)\b", re.I)
        kept_lines = [line for line in text.split("\n")
                      if not self_description.search(line)]
        text = "\n".join(kept_lines)

        # tokenize and remove stoplist
        words_arr = removeStopList(re.findall(r"\w+", text))

        # normalize; float() keeps true division under Python 2 as well
        length = len(words_arr)
        # A list comprehension (not map) so callers get a reusable list on
        # every interpreter version, not a one-shot iterator.
        return [(word, count / float(length))
                for word, count in Counter(words_arr).most_common()]

    def number(self, context):
        """Convert ``(word, weight)`` pairs to index-sorted ``(index, weight)``.

        Each word is looked up with ``self.indexer.index``; pairs whose index
        is negative are discarded (presumably the indexer's marker for
        unknown words -- confirm against ``Indexer``).
        """
        indexed = [(self.indexer.index(t[0]), t[1]) for t in context]
        known = [t for t in indexed if t[0] >= 0]
        return sorted(known, key=itemgetter(0))

    def text2feature(self, text):
        """Shortcut for ``number(vectorize(text))``."""
        return self.number(self.vectorize(text))
# --- Example no. 2 ---
# 0
class IndexedFeature(object):
    '''Turn raw text into a sparse, index-sorted list of word frequencies.

    Words are mapped to integer ids through ``self.indexer``; the usual
    entry point is ``text2feature``.
    '''

    def __init__(self):
        self.indexer = Indexer()

    def freeze(self):
        '''Freeze the underlying indexer (delegates to ``indexer.freeze()``).'''
        self.indexer.freeze()

    def vectorize(self, text):
        '''Return ``[(word, relative_frequency), ...]`` for ``text``.

        Lines containing "<two digits> years old" are dropped before counting
        (presumably so an explicit age statement does not leak the label --
        confirm with the data owner).  The remaining text is tokenized on
        ``\\w+``, passed through ``removeStopList`` and each word count is
        divided by the number of remaining tokens.

        The result is a concrete list in ``Counter.most_common()`` order
        (descending count); text with no remaining tokens yields ``[]``.
        '''
        # NOTE: earlier, now disabled, experiments also dropped lines
        # containing 'year'/'old' and blanked URLs, digits, #hashtags and
        # @mentions; only the age-statement filter below is active.
        age_statement = re.compile(r'\d{2} years old')
        kept_lines = [line for line in text.split('\n')
                      if not age_statement.search(line)]
        text = '\n'.join(kept_lines)

        # tokenize and remove stoplist
        words_arr = removeStopList(re.findall(r'\w+', text))

        # normalize; float() keeps true division under Python 2 as well
        length = len(words_arr)
        # A list comprehension (not map) so callers get a reusable list on
        # every interpreter version, not a one-shot iterator.
        return [(word, count / float(length))
                for word, count in Counter(words_arr).most_common()]

    def number(self, context):
        '''Convert ``(word, weight)`` pairs to index-sorted ``(index, weight)``.

        Each word is looked up with ``self.indexer.index``; pairs whose index
        is negative are discarded (presumably the indexer's marker for
        unknown words -- confirm against ``Indexer``).
        '''
        indexed = [(self.indexer.index(t[0]), t[1]) for t in context]
        known = [t for t in indexed if t[0] >= 0]
        return sorted(known, key=itemgetter(0))

    def text2feature(self, text):
        '''Shortcut for ``number(vectorize(text))``.'''
        return self.number(self.vectorize(text))
class IndexedContext(object):
    '''Load per-user contexts, index their features and emit libsvm rows.

    ``processPairs`` is the driver: for every ``(user_id, label)`` pair it
    appends one libsvm-formatted row to one file and the user id to another.
    '''

    def __init__(self):
        self.indexer = Indexer()
        self.count = 0      # number of tick_counter() calls so far

    def freeze(self):
        '''Freeze the underlying indexer (delegates to ``indexer.freeze()``).'''
        self.indexer.freeze()

    def getIndexer(self):
        '''Return the indexer currently in use.'''
        return self.indexer

    def setIndexer(self, indexer):
        '''Replace the indexer, e.g. to share one between instances.'''
        self.indexer = indexer

    def loadIndexedContext(self, user_id):
        '''Return the user's context as index-sorted ``(index, value)`` pairs.

        The context comes from ``loadContext(user_id)``; any exception it
        raises propagates to the caller.
        '''
        context = loadContext(user_id)
        indexed = [(self.indexer.index(t[0]), t[1]) for t in context]
        # NOTE(review): this is a truthiness test, so index 0 and None are
        # dropped while negative indices are kept -- confirm this matches
        # the Indexer's convention for unknown features.
        kept = [t for t in indexed if t[0]]
        return sorted(kept, key=itemgetter(0))

    def loadIndexedContexts(self, users):
        '''Return ``loadIndexedContext(u)`` for every ``u`` in ``users``.'''
        return [self.loadIndexedContext(u) for u in users]

    def libsvmString(self, context, label):
        '''Format ``context`` as one libsvm row: ``label f:v f:v ...\\n``.

        ``context`` is an iterable of ``(feature, value)`` pairs.
        '''
        features = ''.join(' %s:%s' % (f, v) for f, v in context)
        return str(label) + features + '\n'

    def loadLibsvmString(self, user_id, label):
        '''Load the user's indexed context and format it as a libsvm row.'''
        return self.libsvmString(self.loadIndexedContext(user_id), label)

    def writeToFile(self, output_file, text):
        '''Append ``text`` to ``output_file``.'''
        # ``with`` closes the handle even if write() raises.
        with open(output_file, 'a') as fout:
            fout.write(text)

    def writeId2File(self, output_uid, user_id):
        '''Append ``user_id`` followed by a newline to ``output_uid``.'''
        with open(output_uid, 'a') as fout:
            fout.write("%s\n" % user_id)

    def tick_counter(self):
        '''Count one processed item and report progress on stdout.

        Prints a summary line every 1000 items and a dot every other
        multiple of 100.
        '''
        self.count += 1

        if self.count % 1000 == 0:
            # Single-argument parenthesised form: valid on Python 2 and 3.
            print("%d numbers processed" % self.count)
        elif self.count % 100 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()

    def processUser(self, arg):
        '''Write the libsvm row and id for one user; return success flag.

        ``arg`` is ``(user_id, label)``.  Returns False when the context
        lookup raises ``UserNotFoundError`` and True otherwise.  Relies on
        ``output_libsvm``, ``output_uid``, ``pbar`` and ``pbar_idx`` having
        been set on the instance by ``processPairs``.
        '''
        user_id, label = arg
        output_file = self.output_libsvm
        output_uid = self.output_uid

        try:
            row = self.loadLibsvmString(user_id, label)
        except UserNotFoundError:
            return False
        finally:
            # Advance the progress bar whether or not the user was found.
            self.pbar.update(self.pbar_idx)
            self.pbar_idx += 1

        self.writeToFile(output_file, row)
        self.writeId2File(output_uid, user_id)
        return True

    # NOTE: the file-driven entry point (processFile) is disabled in this
    # variant of the class; use processPairs instead.

    def processPairs(self, pairs, output_libsvm, output_uid):
        '''Process every ``(user_id, label, ...)`` entry of ``pairs``.

        Rows are appended to ``output_libsvm`` and the matching user ids to
        ``output_uid``; a progress bar is shown while running and a summary
        line is printed at the end.
        '''
        # Only the first two fields of each entry are used.
        params = [(p[0], p[1]) for p in pairs]

        self.output_libsvm = output_libsvm
        self.output_uid = output_uid

        self.pbar = ProgressBar(widgets=[Percentage(), Bar()],
                                maxval=len(params)).start()
        self.pbar_idx = 0

        # Main loop; a real list so len()/sum() work on any interpreter.
        results = [self.processUser(p) for p in params]

        self.pbar.finish()
        print('finished processing %d out of %d' % (
            sum(results), len(results)))
class IndexedContext(object):
    '''Load per-user contexts, index their features and emit libsvm rows.

    ``processFile`` is the driver: it reads ``user_id<TAB>label`` lines and
    writes one libsvm-formatted row per user that could be loaded.
    '''

    def __init__(self):
        self.indexer = Indexer()
        self.count = 0      # number of tick_counter() calls so far

    def freeze(self):
        '''Freeze the underlying indexer (delegates to ``indexer.freeze()``).'''
        self.indexer.freeze()

    def getIndexer(self):
        '''Return the indexer currently in use.'''
        return self.indexer

    def setIndexer(self, indexer):
        '''Replace the indexer, e.g. to share one between instances.'''
        self.indexer = indexer

    def loadIndexedContext(self, user_id):
        '''Return the user's context as index-sorted ``(index, value)`` pairs.

        The context comes from ``loadContext(user_id)``; any exception it
        raises propagates to the caller.
        '''
        context = loadContext(user_id)
        indexed = [(self.indexer.index(t[0]), t[1]) for t in context]
        # NOTE(review): this is a truthiness test, so index 0 and None are
        # dropped while negative indices are kept -- confirm this matches
        # the Indexer's convention for unknown features.
        kept = [t for t in indexed if t[0]]
        return sorted(kept, key=itemgetter(0))

    def loadIndexedContexts(self, users):
        '''Return ``loadIndexedContext(u)`` for every ``u`` in ``users``.'''
        return [self.loadIndexedContext(u) for u in users]

    def libsvmString(self, context, label):
        '''Format ``context`` as one libsvm row: ``label f:v f:v ...\\n``.

        ``context`` is an iterable of ``(feature, value)`` pairs.
        '''
        features = ''.join(' %s:%s' % (f, v) for f, v in context)
        return str(label) + features + '\n'

    def loadLibsvmString(self, user_id, label):
        '''Load the user's indexed context and format it as a libsvm row.'''
        return self.libsvmString(self.loadIndexedContext(user_id), label)

    def writeToFile(self, output_file, text):
        '''Append ``text`` to ``output_file``.'''
        # ``with`` closes the handle even if write() raises.
        with open(output_file, 'a') as fout:
            fout.write(text)

    def tick_counter(self):
        '''Count one processed item and report progress on stdout.

        Prints a summary line every 1000 items and a dot every other
        multiple of 100.
        '''
        self.count += 1

        if self.count % 1000 == 0:
            # Single-argument parenthesised form: valid on Python 2 and 3.
            print("%d numbers processed" % self.count)
        elif self.count % 100 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()

    def processUser(self, arg):
        '''Append the libsvm row for one user; return a success flag.

        ``arg`` is ``(user_id, label, output_file)``.  Returns False when
        the context lookup raises ``UserNotFoundError`` and True otherwise.
        '''
        user_id, label, output_file = arg
        try:
            row = self.loadLibsvmString(user_id, label)
        except UserNotFoundError:
            return False
        finally:
            # Count the user whether or not it was found.
            self.tick_counter()

        self.writeToFile(output_file, row)
        return True

    def processFile(self, input_file, output_file):
        '''Convert every user listed in ``input_file`` into libsvm rows.

        ``input_file`` holds one ``user_id<TAB>label`` record per line; a
        line without exactly one tab raises ``ValueError``.  ``output_file``
        is truncated first and then appended to, one row per user found.
        A summary line is printed at the end.
        '''
        # Kept as an assert so callers see the same exception type as before.
        assert os.path.exists(input_file)

        # Truncate (or create) the output before rows are appended to it.
        with open(output_file, 'w'):
            pass

        # Load users to process; ``with`` guarantees the input is closed.
        params = []
        with open(input_file) as fin:
            for line in fin:
                user_id, label = line.rstrip('\n').split('\t')
                params.append((user_id, label, output_file))

        # Run; a real list so len()/sum() work on any interpreter.
        results = [self.processUser(p) for p in params]
        print('finished processing %d out of %d' % (
            sum(results), len(results)))