Example #1
import nltk

def process_resume(email_data):
    # Download the resume attachment and extract its text.
    path = download_attachment(email_data["attachment_url"])
    text = convert_pdf_to_txt(path)
    text = text.rstrip()

    # Persist the extracted text so the helper functions can re-read it.
    with open("temp.txt", "w", encoding="utf-8") as fp:
        fp.write(text)

    resume_text_list = getresumecontent("temp.txt")
    resume_text = ';'.join(resume_text_list)
    clean_text = Stopwords.removeStopWords(resume_text)
    keywords = extract_keywords(resume_text)

    # Run NLTK named-entity recognition over the work-experience section.
    workex = workexfinder("temp.txt")
    lines = workex.split(";;")
    entity_names = []
    string = "".join(lines)
    tokens = nltk.word_tokenize(string)
    tagged = nltk.pos_tag(tokens)
    entities = nltk.chunk.ne_chunk(tagged)
    for entity in entities:
        entity_names.extend(extract_entity_names(entity))

    return {"keywords": keywords, "workex": entity_names}
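Example #1 assumes an extract_entity_names helper that never appears on this page. A minimal sketch of such a helper for the tree that nltk.chunk.ne_chunk returns (an assumption about its behaviour, not the project's actual code):

import nltk

def extract_entity_names(tree):
    # Collect the words of every chunked (named-entity) subtree; plain
    # (word, tag) leaves contribute nothing on their own.
    names = []
    if isinstance(tree, nltk.Tree):
        if tree.label() != 'S':
            names.append(' '.join(word for word, tag in tree.leaves()))
        for child in tree:
            names.extend(extract_entity_names(child))
    return names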
Example #2
    def _preprocessCaption(self, cap):
        def removeAt(cap):
            # Strip @mentions (e.g. "@eddie") up to the next space, tab or '#'.
            end_at = [' ', '\t', '#']
            new_cap = ''
            in_mention = False
            for c in cap:
                if c == '@':
                    in_mention = True
                    continue
                if in_mention and c in end_at:
                    in_mention = False
                if not in_mention:
                    new_cap += c
            return new_cap

        cap = removeAt(cap)

        # Split camel-cased tokens on capital letters and lower-case everything;
        # any character that is neither upper- nor lower-case becomes a separator.
        new_cap = ''
        pre_is_cap = False
        for c in cap:
            if c.isupper():
                if not pre_is_cap:
                    new_cap += ' '
                new_cap += c.lower()
                pre_is_cap = True
                continue

            if c.islower():
                new_cap += c
            else:
                new_cap += ' '
            pre_is_cap = False

        # Count word frequencies, optionally dropping stopwords and always
        # dropping words shorter than three characters.
        words = new_cap.split()
        stopword_list = Stopwords.stopwords()
        tmp_dict = {}

        for word in words:
            word = word.strip()
            if self._stopword_removal and word in stopword_list:
                continue
            if len(word) < 3:
                continue
            tmp_dict[word] = tmp_dict.get(word, 0) + 1
        return tmp_dict
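As a rough, hypothetical illustration (the caption, the proc instance, and the assumption that stopword removal is disabled are all made up for this example):

proc._preprocessCaption("@eddie GoodMorningNYC sunrise run!")
# -> {'good': 1, 'morning': 1, 'nyc': 1, 'sunrise': 1, 'run': 1}

The "@eddie" mention is stripped by removeAt, "GoodMorningNYC" is split on its capital letters and lower-cased, the "!" becomes a separator, and no remaining token falls under the three-character minimum.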
Example #4
    def _preprocessText(self, cap):
        # Normalise the raw text, then count word frequencies, optionally
        # dropping stopwords and always dropping words shorter than three
        # characters.
        new_cap = tool.textPreprocessor(cap)

        words = new_cap.split()
        stopword_list = Stopwords.stopwords()
        tmp_dict = {}

        for word in words:
            word = word.strip()
            if self._stopword_removal and word in stopword_list:
                continue
            if len(word) < 3:
                continue
            tmp_dict[word] = tmp_dict.get(word, 0) + 1
        return tmp_dict
Example #5

import re

from bs4 import BeautifulSoup

re_matcher = re.compile("^https?://.*ics.uci.edu")


def get_links(html):
    links = []
    soup = BeautifulSoup(html, "html.parser")
    for link in soup.findAll('a', attrs={'href': re_matcher}):
        links.append(link.get('href'))
    return links
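For instance, against a small made-up HTML fragment, only the link matching the ics.uci.edu pattern comes back:

sample_html = ('<a href="https://www.ics.uci.edu/about">About</a>'
               '<a href="https://example.com">Other</a>')
print(get_links(sample_html))  # ['https://www.ics.uci.edu/about']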


def hasdigit(token):
    return any(c.isdigit() for c in token)


stopw = Stopwords()


def check_token(token):
    return not stopw.is_stop(token) and not hasdigit(
        token) and len(token) > 1 and len(token) < 20


def add_token(token):
    pass


nonalphanum = re.compile("[^0-9a-z']")


def tokenize_text(intext):
    # Body reconstructed as a minimal sketch (the original snippet is cut off
    # here): lower-case the text, split on non-alphanumeric characters, keep
    # the tokens that pass check_token, and register each one via add_token.
    tokens = []
    for token in nonalphanum.split(intext.lower()):
        if token and check_token(token):
            add_token(token)
            tokens.append(token)
    return tokens
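None of the examples above include the Stopwords helper itself. A minimal sketch of the interface they appear to rely on, with method names inferred from the call sites; this is an assumption, not the project's real implementation:

class Stopwords:
    # Illustrative subset only; a real stopword list would be much longer.
    _words = {'a', 'an', 'and', 'the', 'of', 'to', 'in', 'for', 'is'}

    @classmethod
    def stopwords(cls):
        # Used in Examples #2 and #4 for membership tests against each word.
        return cls._words

    @classmethod
    def removeStopWords(cls, text):
        # Used in Example #1 to strip stopwords from a whitespace-joined string.
        return ' '.join(w for w in text.split() if w.lower() not in cls._words)

    def is_stop(self, token):
        # Used in Example #5 on an instance: stopw = Stopwords(); stopw.is_stop(tok)
        return token.lower() in self._words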
Example #6
import atexit
import os

from zope.component import provideUtility
from zope.component.interfaces import IFactory
from zope.component.testing import setUp

from index import Index
from parsers.english import EnglishParser
from splitter import SplitterFactory
from stopwords import Stopwords
from zopyx.txng3.core.interfaces import IParser, IStopwords, IThesaurus
from zopyx.txng3.core.lexicon import LexiconFactory
from zopyx.txng3.core.storage import StorageWithTermFrequencyFactory
from zopyx.txng3.core.thesaurus import GermanThesaurus

# Setup environment
setUp()
provideUtility(SplitterFactory, IFactory, 'txng.splitters.default')
provideUtility(EnglishParser(), IParser, 'txng.parsers.en')
provideUtility(Stopwords(), IStopwords, 'txng.stopwords')
provideUtility(LexiconFactory, IFactory, 'txng.lexicons.default')
provideUtility(StorageWithTermFrequencyFactory, IFactory,
               'txng.storages.default')
provideUtility(GermanThesaurus, IThesaurus, 'txng.thesaurus.de')
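Once registered, these utilities can be looked up anywhere in the process by interface and name via zope.component; a short illustrative lookup (not part of the original script):

from zope.component import getUtility

parser = getUtility(IParser, 'txng.parsers.en')       # the EnglishParser instance
stopwords = getUtility(IStopwords, 'txng.stopwords')  # the Stopwords utility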

try:
    # Persist the interactive interpreter history between sessions.
    import readline
    histfile = os.path.expanduser('~/.pyhist')
    readline.read_history_file(histfile)
    atexit.register(readline.write_history_file, histfile)
except (ImportError, OSError):
    # readline may be unavailable, or the history file may not exist yet.
    pass


class Text: