Beispiel #1
0
	def _preprocessCaption(self, cap):
		def removeAt(cap):
			# remove @eddie
			end_at = [' ', '\t', '#']
			new_cap = ''
			pre_is_at = False
			for c in cap:
				if c =='@':
					pre_is_at = True
					continue
				
				if pre_is_at == True:
					if c in end_at:
						pre_is_at = False
				
				if pre_is_at == False:
					new_cap += c
			
			return new_cap
			
		cap = removeAt(cap)
			
		new_cap = ''
		pre_is_cap = False
		for c in cap:
			if c.isupper():
				if not pre_is_cap:
					new_cap += ' '
				new_cap += c.lower()
				pre_is_cap = True
				continue

			if c.islower():
				new_cap += c
			else:
				new_cap += ' '
			pre_is_cap = False
			 
		words = new_cap.split()
		stopword_list = Stopwords.stopwords()
		tmp_dict = {} 
		
		for word in words:
			word = word.strip()
			if self._stopword_removal and word in stopword_list:
				continue
			if len(word) < 3:
				continue
			if word in tmp_dict.keys():
				tmp_dict[word] = tmp_dict[word] + 1
			else:
				tmp_dict[word] = 1
		return tmp_dict
Beispiel #2
0
    def _preprocessCaption(self, cap):
        def removeAt(cap):
            # remove @eddie
            end_at = [' ', '\t', '#']
            new_cap = ''
            pre_is_at = False
            for c in cap:
                if c == '@':
                    pre_is_at = True
                    continue

                if pre_is_at == True:
                    if c in end_at:
                        pre_is_at = False

                if pre_is_at == False:
                    new_cap += c

            return new_cap

        cap = removeAt(cap)

        new_cap = ''
        pre_is_cap = False
        for c in cap:
            if c.isupper():
                if not pre_is_cap:
                    new_cap += ' '
                new_cap += c.lower()
                pre_is_cap = True
                continue

            if c.islower():
                new_cap += c
            else:
                new_cap += ' '
            pre_is_cap = False

        words = new_cap.split()
        stopword_list = Stopwords.stopwords()
        tmp_dict = {}

        for word in words:
            word = word.strip()
            if self._stopword_removal and word in stopword_list:
                continue
            if len(word) < 3:
                continue
            if word in tmp_dict.keys():
                tmp_dict[word] = tmp_dict[word] + 1
            else:
                tmp_dict[word] = 1
        return tmp_dict
Beispiel #3
0
    def _preprocessText(self, cap):
        """Convert a text into a word-frequency dictionary.

        The text is first passed through ``tool.textPreprocessor`` (defined
        outside this block; its result is expected to be a string, since it
        is split on whitespace here). Words shorter than three characters
        are skipped, as are words contained in ``Stopwords.stopwords()``
        when ``self._stopword_removal`` is truthy.

        :param cap: raw text handed to ``tool.textPreprocessor``.
        :return: plain ``dict`` mapping each kept word to its occurrence count.
        """
        # split() without arguments already discards surrounding whitespace,
        # so the tokens need no extra strip().
        words = tool.textPreprocessor(cap).split()

        # Only load the stopwords when they are actually needed, and use a
        # set so each membership test is O(1).
        if self._stopword_removal:
            stopwords = frozenset(Stopwords.stopwords())
        else:
            stopwords = frozenset()

        counts = {}
        for word in words:
            if len(word) < 3 or word in stopwords:
                continue
            counts[word] = counts.get(word, 0) + 1
        return counts