Example #1
def normalize(text, lang):
	return stemAndRemoveAccents(towords(normalize_text(text)), lang)
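Read from the inside out: normalize_text cleans the raw string, towords splits it into tokens, and stemAndRemoveAccents stems each token for the given language and drops diacritics. The same pipeline unrolled step by step (same helpers, same behaviour):

def normalize(text, lang):
	cleaned = normalize_text(text)  # lowercase, strip punctuation (see the normalize_text test below)
	words = towords(cleaned)  # split the cleaned string into individual tokens
	return stemAndRemoveAccents(words, lang)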
Example #2
import os  # used for os.path.basename below; the remaining names are helpers defined elsewhere in the same module

def toIndex(documents, stopwords, keylen, lang, elapsed=nothing):
	htmlrem = HTMLRemover()
	compiledDocuments = []
	docID = 0
	allRealWords = set()
	
	for doc in documents:
		try:
			elapsed('parsing: ' + doc['url'])

			if doc['type'] in ['html', 'txt']:
				if doc['type'] == 'html':
					content = unescapeHTMLEntities(doc['content'])

					try:
						content = htmlrem.getText(content)
					except Exception:
						content = strip_html(content)
					
					title = htmlrem.title
					description = htmlrem.description

					if not title:
						title = os.path.basename(doc['url'])

				if doc['type'] == 'txt':
					content = doc['content']
					title = doc.get('title', os.path.basename(doc['url']))
					description = doc.get('description', '')

				words = getWordsWithoutStopWords(normalize_text(content), stopwords)
				allRealWords |= stripAccents(words)

				if words:
					compiledDocuments.append({
						'pureContent': words,
						'content': stemAndRemoveAccents(words, lang),
						'title': title,
						'url': doc['url'],
						'id': docID,
						'description': description,
					})

					docID += 1
		except Exception as err:
			print('Cannot parse ' + str(doc['url']))
			print(str(err))

	if not compiledDocuments:
		raise Exception('No document parsed')
	
	elapsed('Collecting documents...')
	sitesStats = getDocsStats([x['content'] for x in compiledDocuments])
	
	for doc, wordscount in zip(compiledDocuments, sitesStats['wordscount']):
		doc['words'] = wordscount
	
	index = groupByKeylen(sitesStats['occurences'], keylen)
	
	return {'index': index, 'allwords': sitesStats['allwords'],
			'documents': compiledDocuments, 'allRealWords': allRealWords}
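toIndex expects each document as a dict with 'url', 'type' ('html' or 'txt') and 'content'; for plain-text documents 'title' and 'description' are optional and fall back to the file name and an empty string. A minimal usage sketch (the document values, the empty stopword set, keylen=3 and lang='cz' are all made-up assumptions; the helpers above must be importable from the surrounding module):

docs = [{
	'url': 'notes.txt',
	'type': 'txt',
	'content': 'Háčky čárky, to je věda!',
}]
result = toIndex(docs, stopwords=set(), keylen=3, lang='cz', elapsed=print)
print(result['documents'][0]['title'])  # no 'title' key, so it falls back to 'notes.txt'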
Example #3
def parseQuery(self, query):
    pureQuery = normalize_text(query)
    listQuery = sorted(pureQuery.split())
    return tuple(map(self.normalizeQuery, listQuery))
Example #4
def normalizeQuery(self, query):
    return strip_accents(createStem(normalize_text(query)))
Example #5
def getstem(word, lang):
    word = normalize_text(word)
    stem = createStem(word, lang)
    stem = strip_accents(stem)
    return stem
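A hypothetical call of the helper above (the 'cz' language code is an assumption; the exact stem depends on createStem):

print(getstem('Háčky', 'cz'))  # prints the lowercased, stemmed, accent-free form of the word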
Example #6
	def test_normalize_text(self):
		self.assertEqual(
			'háčky čárky to je věda dva tři',
			normalize_text('Háčky čárky, to je věda! Dva + Tři = __?'))