Example no. 1
import sys


def main():
    # positional CLI arguments: commit, project dir, results dir, local path, project name
    param = sys.argv[1:]

    commit = param[0]
    dir_projeto = param[1]
    dir_result = param[2]
    localpath = param[3]
    nome_projeto = param[4]

    parser(commit, dir_projeto, dir_result, localpath, nome_projeto)
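This script would also need the usual entry-point guard to run from the command line; a minimal sketch (assuming parser is imported from the project's own Parser module, as in the later examples):

if __name__ == '__main__':
    # e.g. python main.py <commit> <dir_projeto> <dir_result> <localpath> <nome_projeto>
    main()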
Example no. 2
def get_jobs(self, job_type):
    for item in job_type:
        self.scroll_to_top()
        self.wait_for_element(locators['job_option'])
        select = Select(self.find_element_by_locator(locators['job_option']))
        select.select_by_visible_text(item)
        # The search image has two possible locators, so fall back to the second.
        try:
            self.find_element_by_locator(locators['image']).send_keys(Keys.ENTER)
        except Exception:
            self.find_element_by_locator(locators['image1']).send_keys(Keys.ENTER)
        try:
            self.wait_for_element(locators['caret'])
            self.scroll_to_bottom()
            self.click_button(locators['caret'])
            # Walk the job listing until an index no longer exists, then click
            # the last item that does (the "show all" link).
            for i in range(1, 1000):
                try:
                    self.wait_for_element('xpath=//*[@id="ng-app"]/div[2]/div/div[3]/div[1]/div[2]/div/div/ul/li[' + str(i) + ']/a')
                except Exception:
                    i = i - 1
                    log.info("Not Found_element_SHOW_ALL" + str(i))
                    self.scroll_to_bottom()
                    self.click_button('xpath=//*[@id="ng-app"]/div[2]/div/div[3]/div[1]/div[2]/div/div/ul/li[' + str(i) + ']/a')
                    break
        except Exception:
            log.info("no caret element for " + str(job_type))
        self.scroll_to_bottom()
        self.wait_for_element(locators['job_option'])
        html = self.driver.page_source.encode("utf-8")
        page_parser = parser(html)
        page_parser.scrape_jobs(item)
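The method leans on Selenium's Select wrapper and Keys constants; these are the standard imports it assumes:

from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys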
Example no. 3
def main():
    # Call into the Java EntryPoint through the gateway
    p = parser()
    p.set_concept()
    p.set_instance()
    p.set_property()
    p.set_of()
    for c in p.concept_list:
        concept = p.concept_list[c]
        related = json.dumps(concept.related, ensure_ascii=False)
        same = json.dumps(concept.same, ensure_ascii=False)
        sub = json.dumps(concept.sub, ensure_ascii=False)
        instance = json.dumps(concept.instance, ensure_ascii=False)
        gateway.entry_point.getConceptString(c, concept.name, concept.url,
                                             related, same, sub, instance)
    gateway.entry_point.set_commit()
    for i in p.instance_list:
        ins = p.instance_list[i]
        supplement = json.dumps(ins.supplement, ensure_ascii=False)
        alias = json.dumps(ins.alias, ensure_ascii=False)
        related = json.dumps(ins.related, ensure_ascii=False)
        same = json.dumps(ins.same, ensure_ascii=False)
        comment_ins = json.dumps(ins.comment_ins, ensure_ascii=False)
        pro = json.dumps(ins.pro, ensure_ascii=False)
        gateway.entry_point.getInstanceString(i, ins.name, ins.url,
                                              ins.comment, supplement, alias,
                                              related, same, comment_ins, pro)
    gateway.entry_point.set_commit()
    for pr in p.property_list:
        prop = p.property_list[pr]  # 'prop' avoids shadowing the builtin 'property'
        gateway.entry_point.getPropertyString(pr, prop.label,
                                              prop.fullname)
    gateway.entry_point.set_commit()
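gateway.entry_point is the calling convention of py4j, so the snippet presumably sits behind a gateway set up roughly like this (an assumption; the example itself never shows it):

from py4j.java_gateway import JavaGateway

# Connects to a JVM that is already running py4j's GatewayServer.
gateway = JavaGateway()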
Example no. 4
def parseFiles(self):
    self.parsed = []
    for filename in self.files:
        with open(filename) as vmcode:
            for line in vmcode:
                # filename[:-3] drops the 3-char extension (presumably ".vm")
                self.parsed.append(parser(line, filename[:-3]))
    self.parsed = [
        parsed for parsed in self.parsed if parsed["Type"] != "UNKNOWN"
    ]
Example no. 5
def ParseButton_function(self):
    p = parser("Output.txt")
    p.intitalValues()  # (sic) method name as defined by the parser class
    p.parse_stmt_sequence()
    if p.check_correctness() == 0:
        self.photo.setPixmap(QtGui.QPixmap("parser_syntax_tree.png"))
        self.photo.setScaledContents(True)
    else:
        self.photo.clear()

    self.textBrowser.clear()
    self.CheckParser(p)
Example no. 6
def test(cadena, resultadoEsperado, transicionesEsperadas):
    status = ParserStatus()
    result = parser(cadena, status)
    print('Test string:     ', cadena)
    print('Parser result:   ', result)
    print('Expected result: ', resultadoEsperado)
    print('Parser transitions:  ', status.transiciones)
    print('Expected transitions:', transicionesEsperadas)
    if (result == resultadoEsperado
            and status.transiciones == transicionesEsperadas):
        print('\033[92mOK')    # green
    else:
        print('\033[91mFAIL')  # red
    print('\033[0m')           # reset terminal color
Example no. 7
def main():
    form = ItemForm()
    if form.validate_on_submit():
        barteritem = parser(form.item.data)
        marketitem = marketparser(form.item.data)
        # Both parsers return a plain string when there are no matches; the
        # try/excepts keep .to_html from being called on a string.
        try:
            barter = barteritem.to_html(classes='barteritems', index=False)
        except AttributeError:
            barter = barteritem
        try:
            market = marketitem.to_html(classes='marketitems', index=False)
        except AttributeError:
            market = marketitem
        return render_template('main.html',
                               form=form,
                               barteritem=barter,
                               marketitem=market,
                               markettitle='Flea Market Prices',
                               bartertitle='Barters Available')
    return render_template('main.html', form=form)
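If the non-string result is a pandas DataFrame (an assumption, though to_html(classes=..., index=False) matches DataFrame.to_html), a hypothetical helper with an isinstance check states the intent more directly than catching AttributeError:

import pandas as pd

def as_html(item, css_class):
    # Render DataFrames as HTML tables; pass "no match" strings through unchanged.
    if isinstance(item, pd.DataFrame):
        return item.to_html(classes=css_class, index=False)
    return item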
Example no. 8
def goThroughAllFiles():

    path = "E:\\Wikipedia-Dataset"  #Make sure the files are in this directory

    numOfFiles = 0  #A total of 109832 files have to be parsed!! Keep calm xD

    n = 1

    done = 0  #For testing only: stop early. To index everything, remove this variable from the code.

    #Recursively walks every folder
    for root, dirs, files in os.walk(path):

        for name in files:

            if name.endswith((".html", ".htm")):

                numOfFiles += 1

                pageTitle, headings, text = parser(root + "\\" +
                                                   name)  #See Parser file

                textList = filterDoc(text)  #See FilterAndTokenize file

                if pageTitle is None:
                    pageTitle = name[:-5]  #fall back to the file name minus ".html"
                else:
                    pageTitle = pageTitle.text

                #-----FORWARD & INVERTED INDEX functions: comment one and uncomment the other to build the index-----
                #forwardIndexer(numOfFiles, pageTitle, headings, textList, dictionaryForFI) #See ForwardIndexer file

                headings, pageTitle = porterStemmer(headings, pageTitle)

                invertedIndex(numOfFiles, headings, pageTitle, textList,
                              dictionaryForII)

                print(numOfFiles)

                done += 1

        if done >= 80000:
            break
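The function relies on names defined elsewhere: os, plus the project's parser, filterDoc, porterStemmer, invertedIndex, and dictionaryForII. A guess at part of the import header, taking module names from the "See ... file" comments:

import os

from Parser import parser                # "See Parser file"
from FilterAndTokenize import filterDoc  # "See FilterAndTokenize file"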
Example no. 9
    def __launch_worker(self, task):
        token = str(uuid.uuid4())
        try:

            # prepare args
            # note that phantomjs should be on your PATH.
            proxy_url = "--proxy=" + getRandomHTTPSProxy()
            proxy_type = "--proxy-type=http"
            user_agent = getRandomUserAgent()
            args = ['phantomjs', proxy_url, proxy_type, self.__worker_script_path,
                    task.url, str(task.times), "10000", user_agent]
            logger.info("Start worker for " + task.url + " with proxy " +
                        proxy_url)
            # start worker process
            worker = subprocess.Popen(args, stdout=subprocess.PIPE)
            result = worker.stdout.read()
            try:
                # the worker script brackets the HTML between '::::' markers
                ind_arr = [ind.start() for ind in re.finditer('::::', result)]
                html = result[ind_arr[0] + 4:ind_arr[1]]
                ipnum = str(parser(html))
            except Exception:
                logger.info('fail to load url:' + task.url)
                ipnum = ''

            with codecs.open('result.txt', 'a', encoding='utf-8') as f:
                f.write(task.url[30:] + ':' + ipnum + '\n')
            # worker_info layout: (starting_time, url, times, popen-obj)
            worker_info = (int(time.time()), task.url, task.times, worker)
            time.sleep(1)

            # update worker info
            self.__total_worker_instances += 1
            self.__workers.append(worker_info)
        except Exception as e:
            logger.error("failed to launch worker " + str(e))
Example no. 10
from wordcloud import WordCloud
from Parser import parser
from InfoParser import infoparser

ogg = parser()
# Assumption: leggi() ("read") returns the parsed text; the original snippet
# left `text` undefined.
text = ogg.leggi()
wordcloud = WordCloud().generate(text)

#image = wordcloud.to_image()
#image.show()
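To display the cloud, the word_cloud documentation pairs generate() with matplotlib; the commented lines above are the PIL route, and this is the usual alternative:

import matplotlib.pyplot as plt

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()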
Example no. 11
        _, variable, value = x
        env[variable] = interpretor(value, env)
    else:
        operation = env[x[0]]
        operands = [interpretor(y, env) for y in x[1:]]
        return operation(*operands)


def if_interpretor(x, env):
    _, test, correct, incorrect = x
    if interpretor(test, env):
        return interpretor(correct, env)
    else:
        return interpretor(incorrect, env)


def print_interpretor(x, env):
    if isinstance(x[1], list):
        print(' '.join(x[1]))
    else:
        for y in x[1:]:
            print(env[y], end=' ')
        print()


parsed_output, _ = parser(
    """(begin (if (number? *) (print (YES))(print (NO))))""")
#print(parsed_output)
interpretor(parsed_output, env)

#(if (> (val x) 0) (fn (+ (aref A i) 1) (quote (one two))))
Example no. 12
from Lexer import lexer
from Parser import parser
from Generator import generator
from Optimizer import optimizer

rules = [(r'\b[X]{1,3}[V]?[I]{1,3}\b|\b[X]{1,3}[V]?\b|' +
          r'\b[X]{0,3}[I][XV]\b|\b[V][I]{0,3}\b|\b[I]{1,3}\b', 'ROMAN_DIGIT'),
         (r'\bwhile\b', 'LOOP'), (r'\bdone\b', 'ENDLOOP'),
         (r'[A-Za-z][A-Za-z0-9_]*', 'VAR'), (r'[0-9]*\.[0-9]*', 'FLOAT'),
         (r'[1-9][0-9]*|[0]', 'INT'), (r'\<', 'LESS'), (r'\=', 'EQ'),
         (r'\>', 'LARG'), (r'\*', 'MPY'), (r'\/', 'DIV'), (r'\+', 'ADD'),
         (r'\-', 'SUB'), (r':=', 'ASSIGN'), (r'[\(\)]', 'GROUP'),
         (r'\;', 'END_EXPR'), (r'[\^\&\%\:\#\@\!\~\`\'\"\$]*', 'UNKNOWN')]

if __name__ == "__main__":
    with open('input.txt') as file:
        source_code = file.read()
    id_table, token_list = lexer(source_code, rules)
    print(id_table)
    try:
        ast = parser(token_list)
        print('Parse tree')
        print(ast)
        object_code = generator(ast, id_table)
        object_code = optimizer(object_code)
        with open('object_code.txt', 'w') as file:
            file.write('\n\n'.join(
                [''.join(command) for command in object_code]))
    except SyntaxError as error:
        print(error)
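The grammar that parser accepts isn't shown, so any test input is only a guess built from the token rules above (Roman numerals, while/done, :=, and ;); one might smoke-test with something like:

# Hypothetical smoke test: the real grammar is not shown in this example.
sample = 'x := V ;\nwhile x > I x := x - I ; done\n'
with open('input.txt', 'w') as f:
    f.write(sample)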
Example no. 13
from Parser import parser
pars = parser('machine_code')
pars.program()
Example no. 14
def parse(self, codes):
    self.parsed = [parser(c) for c in codes]
Example no. 15
import sys
from Parser import parser
from CodeWriter import codewriter

filename = sys.argv[1]

vm_code = parser(filename)
assembly_code = codewriter(filename, vm_code)
Example no. 16
	def run(self):
		if len(self.urlbuffer) == 0:
			return
		depth = 0
		for url in self.urlbuffer:
			# crawl the URL
			failed = 0
			doneflag = 0
			while doneflag != 1:
				try:
					if failed > self.retry:
						break
					# spoof a Mozilla user agent
					user_agent = "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:13.0) Gecko/20100101 Firefox/13.0.1"
					headers = {'User-Agent': user_agent}
					reqobj = Request(url=url, headers=headers)
					urlobj = urlopen(reqobj, timeout=self.timeout)
					content = urlobj.read()
					if len(content) == 0:
						failed += 1
						continue
					doneflag = 1
				# an HTTPError or URLError may be raised
				except:
					failed += 1
			if failed > self.retry:
				continue
			#slogger.notice({'msg':'url get success','url':url})
			# TODO: could be improved with multithreading
			# a crawled page must be parsed before it can go into the database
			parseobj = parser()
			self.setparser(parseobj)
			# parse the fetched text
			if self.parser.parsetext(content) == False:
				continue
			parse_content = ""
			for word in self.parser.wordlist:
				# strip newlines from the saved text
				word = word.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')
				parse_content += word + ' '
			try:
				self.parser.close()
			except:
				continue
			webnodeobj = None
			# consolidate the harvested URL info
			newlinklist = []
			for link in self.parser.linklist:
				newlinklist.append(urljoin(url, link))
			# build the webnode entity to hand to the indexer
			if len(self.parser.wordlist) > 0:
				webnodeobj = webnode(url, self.parser.title, content, parse_content, newlinklist, self.parser.wordlist)
			# add newly found links to the crawl queue
			if self.deep >= depth:
				depth += 1
			for link in newlinklist:
				if self.deep >= depth:
					self.urlbuffer.append(link)
			# index the parsed word list
			if self.indexer is not None and webnodeobj is not None:
				self.indexer.index(webnodeobj)
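The imports are not shown; given the Python 2 print statements in the neighboring examples, Request, urlopen, and urljoin would presumably come from:

from urllib2 import Request, urlopen
from urlparse import urljoin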
Example no. 17
		self.spider = spider()
		self.indexer = indexer()
		self.parser = parser()
		self.urllist = urllist
	
	def start(self):
		if len(self.urllist) == 0:
			return False
		self.spider.addurllist(self.urllist)
		self.spider.setparser(self.parser)
		self.spider.setindexer(self.indexer)
		self.spider.run()
		return True

	def cleanup(self):
		self.indexer.closedb()
	

if __name__ == "__main__":

	spider = spider()
	#spider.addurl('http://localhost:9080/setest/test.php')
	spider.addurl('http://hq.booksarefun.com/')
	parserobj = parser()
	indexobj = indexer()
	spider.setparser(parserobj)
	spider.setindexer(indexobj)
	spider.run()
	indexobj.closedb()
	print 'done!'
Example no. 18
	def __init__(self, urllist=None):
		# use None instead of a mutable default argument
		self.spider = spider()
		self.indexer = indexer()
		self.parser = parser()
		self.urllist = urllist if urllist is not None else []
Example no. 19
        self.spider = spider()
        self.indexer = indexer()
        self.parser = parser()
        self.urllist = urllist

    def start(self):
        if len(self.urllist) == 0:
            return False
        self.spider.addurllist(self.urllist)
        self.spider.setparser(self.parser)
        self.spider.setindexer(self.indexer)
        self.spider.run()
        return True

    def cleanup(self):
        self.indexer.closedb()


if __name__ == "__main__":

    spider = spider()
    #spider.addurl('http://localhost:9080/setest/test.php')
    spider.addurl('http://hq.booksarefun.com/')
    parserobj = parser()
    indexobj = indexer()
    spider.setparser(parserobj)
    spider.setindexer(indexobj)
    spider.run()
    indexobj.closedb()
    print 'done!'
Example no. 20
#################################################################
##				   	       MAIN FILE	   					   ##
#################################################################

from Sprint1 import sprint1
from Sprint2 import sprint2
from Sprint3 import sprint3
from Sprint4 import sprint4
from Parser import parser, individual, family

parser()
sprint1(individual, family)
sprint2(individual, family)
sprint3(individual, family)
sprint4(individual, family)
Example no. 21
def __init__(self, urllist=None):
    # use None instead of a mutable default argument
    self.spider = spider()
    self.indexer = indexer()
    self.parser = parser()
    self.urllist = urllist if urllist is not None else []