Example #1
0
 def PDF(self, url, enc='UTF-8'):
     """Download a PDF and return its normalized text content.

     The file at ``url`` is saved as ``temp.pdf`` inside TEMP_FOLDER,
     converted to ``temp.txt`` by the external PDFTOTEXT command and the
     resulting text is passed through ``normalizePDF``.

     url -- location handed to ``downloadFile``.
     enc -- value passed to the converter's ``-enc`` option.

     NOTE(review): the work files have fixed names and the converter's
     exit status is not inspected.
     """
     base = TEMP_FOLDER + "temp."
     pdfdest, txtdest = base + "pdf", base + "txt"
     downloadFile(url, pdfdest)
     # PDFTOTEXT is concatenated directly with "-enc", exactly as before.
     command = " ".join([PDFTOTEXT + "-enc", enc, pdfdest, txtdest])
     os.system(command)
     return normalizePDF(readfile(txtdest))
Example #2
0
def tempSearch(path):
	"""Run a one-off search described by the file at ``path``.

	The file content is parsed with ``loads`` (presumably JSON -- confirm
	against the import). Its optional 'options' mapping supplies 'lang'
	(default 'en') and 'query' (default 'lion'). A TempSearch index is
	built from the parsed data and queried; the result is returned
	through ``getJson``.
	"""
	data = loads(readfile(path))
	options = data.get('options', {})
	lang = options.get('lang', 'en')
	index = TempSearch().build(data, getStopWords(lang))
	query = options.get('query', 'lion')
	return getJson(tempSearchQuery(index, query, {}, lang))
Example #3
0
	def PDF(self, url, enc = 'UTF-8'):
		"""Fetch the PDF at ``url`` and return its text after normalization.

		The download goes to TEMP_FOLDER + "temp.pdf"; the external
		PDFTOTEXT command writes TEMP_FOLDER + "temp.txt" using ``enc``
		as its ``-enc`` value. That text file is read back and passed to
		``normalizePDF``.

		NOTE(review): the return code of the conversion is ignored and
		the temporary file names are fixed.
		"""
		workPrefix = TEMP_FOLDER + "temp."
		pdfPath = workPrefix + "pdf"
		textPath = workPrefix + "txt"
		downloadFile(url, pdfPath)
		convertArgs = ["-enc", enc, pdfPath, textPath]
		# PDFTOTEXT is prepended without a separator, as in the original command.
		os.system(PDFTOTEXT + " ".join(convertArgs))
		rawText = readfile(textPath)
		return normalizePDF(rawText)
Example #4
0
def buildIndex(databaseName, linksSourcePath, currSettings, lang):
	"""Build the index for ``databaseName`` from a file of links.

	databaseName    -- sub-folder name under DATABASES_FOLDER.
	linksSourcePath -- file read with ``readfile``; one entry per line.
	currSettings    -- mapping of overrides applied on top of the
	                   settings loaded from DATA_FOLDER + SETTINGS_FILE.
	lang            -- language passed to ``getStopWords`` and to the
	                   index build.
	"""
	settings = Settings(DATA_FOLDER + SETTINGS_FILE)
	for name, override in currSettings.items():
		settings.set(name, override)

	targetDir = DATABASES_FOLDER + databaseName + '/'
	linkList = readfile(linksSourcePath).splitlines()

	manager = IndexManager(settings)
	# presumably turns progress output on -- confirm in IndexManager
	manager.shutUp = False
	manager.build(linkList, targetDir, getStopWords(lang), lang)
Example #5
0
    def test_toIndex(self):
        """Compare ``repr(toIndex(...))`` with the stored fixture files.

        Three configurations are checked against index1.txt, index2.txt
        and index3.txt in TEST_FOLDER.
        """
        pages = downloads(self.getURLs())
        sites = [
            {'type': 'html', 'content': page, 'url': 'url'}
            for page in pages
        ]

        # To regenerate a fixture, write repr(toIndex(sites, <stop words>, <n>))
        # to the matching TEST_FOLDER file with savefile.

        def check(stopWords, thirdArg, fixtureName):
            # thirdArg is forwarded as toIndex's third positional argument;
            # its meaning is not visible from this test.
            result = toIndex(sites, stopWords, thirdArg)
            desired = readfile(TEST_FOLDER + fixtureName)
            self.assertEqual(repr(result), desired)

        check([], 1, 'index1.txt')
        check(getStopWords(), 1, 'index2.txt')
        check(getStopWords(), 2, 'index3.txt')
Example #6
0
	def testSearchCommand(self):
		"""Run shell searches on 'matweb-test' and compare with fixtures.

		For every (query, fixture) pair the output of ``self.runShell`` is
		decoded as UTF-8 and must equal the content of
		TEST_FOLDER + fixture + '.txt'.
		"""
		# The 'rovnice' -> 'matweb3' check was already disabled (commented
		# out) and is therefore not listed -- TODO confirm whether it should
		# be re-enabled.
		cases = [
			('derivace', 'matweb0'),
			('nesmysl', 'matweb1'),
			('(spocetne OR nespocetne) mnoziny', 'matweb2'),
			('rovnice NOT (linearni OR pravdepodobnost)', 'matweb4'),
		]

		for query, fixture in cases:
			output = self.runShell('matweb-test', query).decode("utf-8")
			expected = readfile(TEST_FOLDER + fixture + '.txt')
			self.assertEqual(output, expected)
Example #7
0
    def testSearchCommand(self):
        """Check shell search output for 'matweb-test' against fixtures.

        Each query is run through ``self.runShell``, the returned bytes
        are decoded as UTF-8 and compared with the text stored in
        TEST_FOLDER + <fixture> + '.txt'.
        """
        database = 'matweb-test'
        # Query -> fixture name. The 'rovnice' / 'matweb3' pair is omitted
        # because that check was already commented out -- TODO confirm
        # whether it should come back.
        expectations = (
            ('derivace', 'matweb0'),
            ('nesmysl', 'matweb1'),
            ('(spocetne OR nespocetne) mnoziny', 'matweb2'),
            ('rovnice NOT (linearni OR pravdepodobnost)', 'matweb4'),
        )

        for query, fixture in expectations:
            actual = self.runShell(database, query).decode("utf-8")
            self.assertEqual(actual, readfile(TEST_FOLDER + fixture + '.txt'))
Example #8
0
def getStopWords(lang):
	"""Return the set of stop words for ``lang``.

	Words are read from DATA_FOLDER + STOPWORDS_NAME + "." + lang + ".txt"
	and split on whitespace. Loading goes through ``_cache_result``.

	lang -- language code used as part of the file name.
	"""
	def load():
		path = DATA_FOLDER + STOPWORDS_NAME + "." + lang + ".txt"
		return set(readfile(path).split())

	# The cache key includes the language. With the previous constant key
	# 'stopwords', a second language would receive the first language's
	# words (assuming _cache_result memoizes by key -- confirm).
	return _cache_result('stopwords.' + lang, load)
Example #9
0
 def loadData(self):
     """Load queries from 'matwebsearches.txt' in DATA_FOLDER.

     Stores at most ``self.maxQueries`` leading lines of the file in
     ``self.pureData``.
     """
     lines = readfile(DATA_FOLDER + 'matwebsearches.txt').splitlines()
     self.pureData = lines[:self.maxQueries]
Example #10
0
	def loadData(self):
		"""Fill ``self.pureData`` with lines from the searches file.

		The file DATA_FOLDER + 'matwebsearches.txt' is split into lines
		and cut to the first ``self.maxQueries`` entries.
		"""
		sourcePath = DATA_FOLDER + 'matwebsearches.txt'
		allLines = readfile(sourcePath).splitlines()
		limit = self.maxQueries
		self.pureData = allLines[:limit]
Example #11
0
	def loadSettings(self, path):
		"""Read the settings file at ``path`` and parse it as JSON.

		The raw file content is kept in ``self.text`` and the decoded
		object in ``self.settings``.
		"""
		raw = readfile(path)
		# Store the raw text first, so it is kept even if decoding fails.
		self.text = raw
		self.settings = json.loads(raw)
Example #12
0
	def getURLs(cls):
		"""Return the lines of DATA_FOLDER + 'test.txt' as a list."""
		return readfile(DATA_FOLDER + 'test.txt').splitlines()