class IndexedFeature(object):
    def __init__(self):
        self.indexer = Indexer()

    def freeze(self):
        self.indexer.freeze()

    def vectorize(self, text):

        # Drop lines that state the label directly (self-reported gender); a better filter may be needed.
        texts = text.split("\n")
        texts = filter(lambda t: not re.search(r"\b(I am|I'm|Im) a (man|woman)\b", t, re.I), texts)

        text = "\n".join(texts)

        # tokenize and remove stoplist
        words_arr = re.findall(r"\w+", text)
        words_arr = removeStopList(words_arr)

        # normalize
        length = len(words_arr)
        context = Counter(words_arr).most_common()
        context = map(lambda x: (x[0], x[1] / float(length)), context)

        return context

    def number(self, context):
        """ convert text to feature array """
        idx_context = map(lambda t: (self.indexer.index(t[0]), t[1]), context)
        idx_context = filter(lambda t: t[0] >= 0, idx_context)
        idx_context = sorted(idx_context, key=itemgetter(0))
        return idx_context

    def text2feature(self, text):
        return self.number(self.vectorize(text))
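
# Example usage of IndexedFeature (a sketch; Indexer and removeStopList come
# from elsewhere in this package, and the text below is illustrative):
#
#     feat = IndexedFeature()
#     vec = feat.text2feature("love coffee\nI'm a woman who codes")
#     # vec is a sorted list of (feature_index, relative_frequency) pairs;
#     # the "I'm a woman ..." line is dropped by the profile filter above.
#     feat.freeze()    # freeze the indexer before converting evaluation data
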
class FollowingFeatureConverter(FeatureConverter):

    def __init__(self, getFollowing = None):

        assert(getFollowing is not None)

        self.getFollowing = getFollowing
        self.indexer = Indexer()


    def passIndexer(self, feature_arr):
        '''
        @param feature_arr    This is a list of (predicate, weight)
        '''

        def _indexed(feature_weight):
            feature, weight = feature_weight
            return (self.indexer.index(feature), weight)

        return map(_indexed, feature_arr)
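
    # For example, with a fresh Indexer that assigns indices 0, 1, ... in
    # order of first appearance (an assumption about this Indexer):
    #     passIndexer([('1001', 1), ('1002', 1)])  ->  [(0, 1), (1, 1)]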


    def convertSingle(self, user_id):

        user_id = str(user_id)

        followings = self.getFollowing(user_id)

        if followings is None:
            return None

        feature_arr = map(lambda x: (x, 1), followings)
        feature_arr = self.passIndexer(feature_arr)

        return sorted(feature_arr, key = itemgetter(0))
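
# A minimal usage sketch for FollowingFeatureConverter (getFollowing_stub is
# hypothetical; any callable mapping user_id -> list of followed ids, or None
# when the user is unknown, will do):
#
#     def getFollowing_stub(user_id):
#         return ['1001', '1002'] if user_id == '42' else None
#
#     conv = FollowingFeatureConverter(getFollowing=getFollowing_stub)
#     conv.convertSingle(42)    # sorted (feature_index, 1) pairs, or None
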
class IndexedFeature(object):
    def __init__(self):
        self.indexer = Indexer()

    def freeze(self):
        self.indexer.freeze()

    def vectorize(self, text):

        # Drop lines that state the label directly (self-reported age); a better filter may be needed.
        texts = text.split('\n')
        #texts = filter(lambda t: not re.search('year', t), texts)
        #texts = filter(lambda t: not re.search('old', t), texts)
        texts = filter(lambda t: not re.search(r'\d{2} years old', t), texts)

        text = '\n'.join(texts)

        # Replace Twitter specific patterns.
        #text = re.sub(r'https?:\S+', ' ', text)
        #text = re.sub(r'\b\d{2} years old', ' ', text)
        #text = re.sub(r'old', ' ', text)
        #text = re.sub(r'years', ' ', text)
        #text = re.sub(r'\d+', ' ', text)
        #text = re.sub(r'#\w+', ' ', text)
        #text = re.sub(r'@\w+', ' ', text)

        # tokenize and remove stoplist
        words_arr = re.findall(r'\w+', text)
        words_arr = removeStopList(words_arr)

        # normalize
        length = len(words_arr)
        context = Counter(words_arr).most_common()
        context = map(lambda x: (x[0], x[1] / float(length)), context)

        return context

    def number(self, context):
        ''' convert text to feature array '''
        idx_context = map(lambda t: (self.indexer.index(t[0]), t[1]), context)
        idx_context = filter(lambda t: t[0] >= 0, idx_context)
        idx_context = sorted(idx_context, key=itemgetter(0))
        return idx_context

    def text2feature(self, text):
        return self.number(self.vectorize(text))
class IndexedContext(object):

    def __init__(self):
        self.indexer = Indexer()
        self.count = 0


    def freeze(self):
        self.indexer.freeze()


    def getIndexer(self):
        return self.indexer


    def setIndexer(self, indexer):
        self.indexer = indexer


    def loadIndexedContext(self, user_id):
        context = loadContext(user_id)
        idx_context = map(lambda t: (self.indexer.index(t[0]), t[1]), context)
        idx_context = filter(lambda t: t[0] >= 0, idx_context)    # drop tokens unknown to the indexer
        idx_context = sorted(idx_context, key=itemgetter(0))
        return idx_context


    def loadIndexedContexts(self, users):
        contexts = map(lambda u: self.loadIndexedContext(u), users)
        return contexts


    def libsvmString(self, context, label):
        ''' convert the loaded context into libsvm context. '''
        row = str(label)
        for f, v in context:    # (feature, value)
            row += ' %s:%s' % (f, v)
        row += '\n'
        return row
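
    # For example:
    #     libsvmString([(3, 0.5), (7, 0.25)], 1)  ->  '1 3:0.5 7:0.25\n'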


    def loadLibsvmString(self, user_id, label):
        return self.libsvmString(self.loadIndexedContext(user_id), label)


    def writeToFile(self, output_file, text):
        fout = open(output_file, 'a')
        fout.write(text)
        fout.close()


    def writeId2File(self, output_uid, user_id):
        fout = open(output_uid, 'a')
        fout.write("%s\n" % user_id)
        fout.close()


    def tick_counter(self):
        self.count += 1

        if self.count % 1000 == 0:
            print "%d numbers processed" % self.count
        elif self.count % 100 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()


    def processUser(self, arg):
        ''' arg = (user_id, label) '''
        user_id, label = arg
        output_file = self.output_libsvm
        output_uid = self.output_uid

        try:
            row = self.loadLibsvmString(user_id, label)
        except UserNotFoundError:
            return False
        finally:
            #self.tick_counter()
            self.pbar.update(self.pbar_idx)
            self.pbar_idx += 1

        self.writeToFile(output_file, row)
        self.writeId2File(output_uid, user_id)


        return True


    #def processFile(self, input_file, output_file):#{{{
    #    assert(os.path.exists(input_file))
    #    fout = open(output_file, 'w')
    #    fout.close()

    #    # Load users to process
    #    params = []
    #    for line in open(input_file):
    #        user_id, label = line.rstrip('\n').split('\t')
    #        params.append((user_id, label, output_file))

    #    # Run!!
    #    results = map(lambda p: self.processUser(p), params)
    #    print 'finished processing %d out of %d' % (
    #            sum(results), len(results))#}}}


    def processPairs(self, pairs, output_libsvm, output_uid):
        # convert the format
        params = [(p[0], p[1]) for p in pairs]

        self.output_libsvm = output_libsvm
        self.output_uid = output_uid

        # Run!!
        self.pbar = ProgressBar(widgets=[Percentage(), Bar()],
                maxval=len(params)).start()
        self.pbar_idx = 0

        ## Main Loop
        results = map(lambda p: self.processUser(p), params)

        self.pbar.finish()
        print 'finished processing %d out of %d' % (
                sum(results), len(results))
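
# Sketch of driving IndexedContext over labelled users (user ids, labels and
# file names below are illustrative; loadContext must be able to resolve each
# user_id):
#
#     ic = IndexedContext()
#     pairs = [('1234567', 1), ('7654321', -1)]
#     ic.processPairs(pairs, 'train.libsvm', 'train.uid')
#     # appends one libsvm row per resolvable user to train.libsvm and the
#     # matching user ids to train.uid, with a progress bar while it runs
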
class IndexedContext(object):

    def __init__(self):
        self.indexer = Indexer()
        self.count = 0


    def freeze(self):
        self.indexer.freeze()


    def getIndexer(self):
        return self.indexer


    def setIndexer(self, indexer):
        self.indexer = indexer


    def loadIndexedContext(self, user_id):
        context = loadContext(user_id)
        idx_context = map(lambda t: (self.indexer.index(t[0]), t[1]), context)
        idx_context = filter(lambda t: t[0] >= 0, idx_context)    # drop tokens unknown to the indexer
        idx_context = sorted(idx_context, key=itemgetter(0))
        return idx_context


    def loadIndexedContexts(self, users):
        contexts = map(lambda u: self.loadIndexedContext(u), users)
        return contexts


    def libsvmString(self, context, label):
        ''' convert the loaded context into libsvm context. '''
        row = str(label)
        for f, v in context:    # (feature, value)
            row += ' %s:%s' % (f, v)
        row += '\n'
        return row


    def loadLibsvmString(self, user_id, label):
        return self.libsvmString(self.loadIndexedContext(user_id), label)


    def writeToFile(self, output_file, text):
        fout = open(output_file, 'a')
        fout.write(text)
        fout.close()


    def tick_counter(self):
        self.count += 1

        if self.count % 1000 == 0:
            print "%d numbers processed" % self.count
        elif self.count % 100 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()


    def processUser(self, arg):
        ''' arg = (user_id, label, output_file) '''
        user_id, label, output_file = arg
        try:
            row = self.loadLibsvmString(user_id, label)
        except UserNotFoundError:
            return False
        finally:
            self.tick_counter()

        self.writeToFile(output_file, row)
        return True


    def processFile(self, input_file, output_file):
        assert(os.path.exists(input_file))
        fout = open(output_file, 'w')
        fout.close()

        # Load users to process
        params = []
        for line in open(input_file):
            user_id, label = line.rstrip('\n').split('\t')
            params.append((user_id, label, output_file))

        # Run!!
        results = map(lambda p: self.processUser(p), params)
        print 'finished processing %d out of %d' % (
                sum(results), len(results))
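
# processFile expects input_file to contain one tab-separated line per user,
# "user_id<TAB>label", e.g. "1234567<TAB>1". A usage sketch (file names are
# illustrative):
#
#     ic = IndexedContext()
#     ic.processFile('labels.tsv', 'out.libsvm')
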
class TextFeatureConverter(FeatureConverter):
    '''
    The indexer that maps a given word to a feature number is kept even after
    converting a series of users, so successive calls to 'convert' are
    expected to return a consistent feature mapping.
    '''

    def __init__(self, getText = None, stopfile = None):
        '''
        @param getText  a function which takes a user_id and returns a tuple
                        (text, length), where length is the number of statuses
                        crawled, used later to normalize the text.
        @param stopfile A file containing a list of stop words. A default
                        file is used if none is given.
        '''
        super(TextFeatureConverter, self).__init__()    # if any

        self.getText = getText
        self.indexer = Indexer()

        # Process stop list
        self.stoplist = {}

        if stopfile is None:
            stopfile = '../../data/stoplist.txt'

        with open(stopfile) as stop:
            for line in stop:
                self.stoplist[line.rstrip('\n')] = True


    def writeIndexed(self, fout_name):
        fout = open(fout_name, 'w')
        fout.write(str(self.indexer))
        fout.close()


    def textToSparseTokens(self, text, pass_indexer = True):
        '''
        Given a long text, convert it to a list of words (or features).

        The returned format is:
            [(word, count), (word, count), ...]

        i.e., an unnormalized feature list.
        '''

        def _removeStoplist(words_arr):
            new_words_arr = []

            for word in words_arr:
                if not word.lower() in self.stoplist:
                    new_words_arr.append(word)

            return new_words_arr

        def _replacePattern(text):
            #text = re.sub(r'#\w+', 'tag_pattern', text)
            #text = re.sub(r'\b\d+\b', ' ', text)
            #text = re.sub(r'@\w+', 'mention_pattern', text)
            #text = re.sub(r'https?:\S+', 'url_pattern', text)
            return text

        words_arr = re.findall(r'\w+', _replacePattern(text))
        words_arr = map(lambda x: x.lower(), words_arr)
        words_arr = _removeStoplist(words_arr)

        #if pass_indexer:    # this conditioning added to handle a special case
        #                    # needed by passWeightCoeff module
        #    words_arr = map(self.indexer.index, words_arr)

        c = Counter(words_arr)

        return c.most_common()
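
    # For instance, assuming 'and' is in the stop list:
    #     textToSparseTokens("Cats and dogs. Cats!")  ->  [('cats', 2), ('dogs', 1)]
    # (Counter.most_common() breaks ties in an arbitrary order.)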


    def normalizeWordsArr(self, words_arr, length):

        def _single(pair):
            return (pair[0], float(pair[1]) / length)

        return map(_single, words_arr)


    def additionalPass(self, words_arr, user_id, semi_label):
        ''' A subclass may override this method to add functionality, such as
        changing the weight of individual features. '''
        return words_arr
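
    # A subclass might override this hook, e.g. (illustrative only):
    #
    #     class ReweightedTextFeatureConverter(TextFeatureConverter):
    #         def additionalPass(self, words_arr, user_id, semi_label):
    #             # double the weight of every feature
    #             return [(word, 2.0 * weight) for word, weight in words_arr]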


    def passIndexer(self, words_arr):
        '''
        @param words_arr    This is a list of (word, weight)
        '''

        def _indexed(word_weight):
            word, weight = word_weight
            return (self.indexer.index(word), weight)

        return map(_indexed, words_arr)


    def convertSingle(self, user_id, semi_label = None):
        '''
        This method is called repeatedly by the 'writeFeatures' method, once
        for each user_id encountered.
        '''
        user_id = str(user_id)
        mytext, length = self.getText(user_id)

        if mytext is None:      # the user's text could not be loaded
            return None

        words_arr = self.textToSparseTokens(mytext)
        words_arr = self.normalizeWordsArr(words_arr, length)
        words_arr = self.additionalPass(words_arr, user_id, semi_label) # hook
        words_arr = self.passIndexer(words_arr)

        return sorted(words_arr, key=itemgetter(0))
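
# Minimal usage sketch for TextFeatureConverter (getText_stub and the file
# names are hypothetical; getText must return a (text, length) tuple, or
# (None, _) on failure):
#
#     def getText_stub(user_id):
#         return ("love coffee\ngreat coffee every morning", 2)
#
#     conv = TextFeatureConverter(getText=getText_stub, stopfile='stop.txt')
#     features = conv.convertSingle('42')   # sorted (feature_index, weight) pairs
#     conv.writeIndexed('vocab.txt')        # dump the word -> index mapping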