def main():
    param = sys.argv[1:]
    commit = param[0]
    dir_projeto = param[1]
    dir_result = param[2]
    localpath = param[3]
    nome_projeto = param[4]
    parser(commit, dir_projeto, dir_result, localpath, nome_projeto)
def get_jobs(self, job_type):
    for item in job_type:
        self.scroll_to_top()
        self.wait_for_element(locators['job_option'])
        select = Select(self.find_element_by_locator(locators['job_option']))
        select.select_by_visible_text(item)
        try:
            self.find_element_by_locator(locators['image']).send_keys(Keys.ENTER)
        except:
            self.find_element_by_locator(locators['image1']).send_keys(Keys.ENTER)
        try:
            self.wait_for_element(locators['caret'])
            self.scroll_to_bottom()
            self.click_button(locators['caret'])
            #self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # Used a loop to list all items of the job listing
            for i in range(1, 1000):
                try:
                    self.wait_for_element("xpath=//*[@id=\"ng-app\"]/div[2]/div/div[3]/div[1]/div[2]/div/div/ul/li[" + str(i) + "]/a")
                except:
                    i = i - 1
                    log.info("Not Found_element_SHOW_ALL" + str(i))
                    self.scroll_to_bottom()
                    self.click_button("xpath=//*[@id=\"ng-app\"]/div[2]/div/div[3]/div[1]/div[2]/div/div/ul/li[" + str(i) + "]/a")
                    break
        except:
            log.info("no caret element for " + str(job_type))
        #self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        self.scroll_to_bottom()
        self.wait_for_element(locators['job_option'])
        html = self.driver.page_source.encode("utf-8")
        page_parser = parser(html)
        page_parser.scrape_jobs(item)
def main():
    # Call the randInt function exposed by the Java EntryPoint
    p = parser()
    p.set_concept()
    p.set_instance()
    p.set_property()
    p.set_of()
    for c in p.concept_list:
        concept = p.concept_list[c]
        related = json.dumps(concept.related, ensure_ascii=False)
        same = json.dumps(concept.same, ensure_ascii=False)
        sub = json.dumps(concept.sub, ensure_ascii=False)
        instance = json.dumps(concept.instance, ensure_ascii=False)
        gateway.entry_point.getConceptString(c, concept.name, concept.url, related, same, sub, instance)
    gateway.entry_point.set_commit()
    for i in p.instance_list:
        ins = p.instance_list[i]
        supplement = json.dumps(ins.supplement, ensure_ascii=False)
        alias = json.dumps(ins.alias, ensure_ascii=False)
        related = json.dumps(ins.related, ensure_ascii=False)
        same = json.dumps(ins.same, ensure_ascii=False)
        comment_ins = json.dumps(ins.comment_ins, ensure_ascii=False)
        pro = json.dumps(ins.pro, ensure_ascii=False)
        gateway.entry_point.getInstanceString(i, ins.name, ins.url, ins.comment, supplement, alias, related, same, comment_ins, pro)
    gateway.entry_point.set_commit()
    for pr in p.property_list:
        property = p.property_list[pr]
        gateway.entry_point.getPropertyString(pr, property.label, property.fullname)
    gateway.entry_point.set_commit()
def parseFiles(self):
    self.parsed = []
    for filename in self.files:
        with open(filename) as vmcode:
            for line in vmcode.readlines():
                self.parsed.append(parser(line, filename[:-3]))
    self.parsed = [
        parsed for parsed in self.parsed
        if not parsed["Type"] == "UNKNOWN"
    ]
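# A minimal hypothetical stand-in for the Parser.parser call used above, shown
# only to illustrate the return shape that parseFiles appears to assume: a dict
# with a "Type" key, where blank/comment lines come back as "UNKNOWN". The
# field names and command categories here are assumptions, not the original module.
def parser_sketch(line, module):
    line = line.split('//')[0].strip()      # drop inline comments and whitespace
    if not line:
        return {"Type": "UNKNOWN", "Module": module}
    fields = line.split()
    if fields[0] in ("push", "pop"):
        return {"Type": fields[0].upper(), "Segment": fields[1],
                "Index": int(fields[2]), "Module": module}
    return {"Type": "ARITHMETIC", "Command": fields[0], "Module": module}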
def ParseButton_function(self):
    p = parser("Output.txt")
    p.intitalValues()
    p.parse_stmt_sequence()
    if p.check_correctness() == 0:
        self.photo.setPixmap(QtGui.QPixmap("parser_syntax_tree.png"))
        self.photo.setScaledContents(True)
    else:
        self.photo.clear()
        self.textBrowser.clear()
    self.CheckParser(p)
def test(cadena, resultadoEsperado, transicionesEsperadas):
    status = ParserStatus()
    result = parser(cadena, status)
    print('Test string:         ', cadena)
    print('Parser result:       ', result)
    print('Expected result:     ', resultadoEsperado)
    print('Parser transitions:  ', status.transiciones)
    print('Expected transitions:', transicionesEsperadas)
    if (result == resultadoEsperado
            and status.transiciones == transicionesEsperadas):
        print('\033[92mOK')
    else:
        print('\033[91mFAIL')
    print('\033[0m')
def main():
    form = ItemForm()
    if form.validate_on_submit():
        barteritem = parser(form.item.data)
        marketitem = marketparser(form.item.data)
        # Both parsers return a plain string when there are no matches;
        # the try/excepts avoid calling .to_html on a string result.
        try:
            barter = barteritem.to_html(classes='barteritems', index=False)
        except:
            barter = barteritem
        try:
            market = marketitem.to_html(classes='marketitems', index=False)
        except:
            market = marketitem
        return render_template('main.html', form=form, barteritem=barter,
                               marketitem=market, markettitle='Flea Market Prices',
                               bartertitle='Barters Available')
    return render_template('main.html', form=form)
def goThroughAllFiles():
    path = "E:\\Wikipedia-Dataset"  # Make sure the files are in this directory
    numOfFiles = 0  # Total 109832 files have to be parsed!! Keep Calm xD
    n = 1
    done = 0  # Just for testing: indexing stops after `done` reaches the limit below. To index everything, remove this variable.
    # Recursively walk every folder
    for root, dirs, files in os.walk(path):
        for name in files:
            if name.endswith((".html", "htm")):
                numOfFiles += 1
                pageTitle, headings, text = parser(root + "\\" + name)  # See Parser file
                textList = filterDoc(text)  # See FilterAndTokenize file
                if pageTitle is None:
                    pageTitle = name[:-5]
                else:
                    pageTitle = pageTitle.text
                # ----- FORWARD & INVERTED INDEX: comment one and uncomment the other to build the index -----
                #forwardIndexer(numOfFiles, pageTitle, headings, textList, dictionaryForFI)  # See ForwardIndexer file
                headings, pageTitle = porterStemmer(headings, pageTitle)
                invertedIndex(numOfFiles, headings, pageTitle, textList, dictionaryForII)
                print(numOfFiles)
                done += 1
                if done >= 80000:
                    break
def __launch_worker(self, task):
    token = str(uuid.uuid4())
    try:
        # prepare args
        # note that phantomjs should be on your PATH.
        proxy_url = "--proxy=" + getRandomHTTPSProxy()
        proxy_type = "--proxy-type=http"
        user_agent = getRandomUserAgent()
        args = ['phantomjs']
        args[1:1] = [proxy_url, proxy_type, self.__worker_script_path,
                     task.url, str(task.times), "10000", user_agent]
        logger.info("Start worker for " + task.url + " with proxy " + proxy_url)
        # start worker process
        worker = subprocess.Popen(args, stdout=subprocess.PIPE)
        result = worker.stdout.read()
        try:
            ind_arr = [ind.start() for ind in re.finditer('::::', result)]
            html = result[ind_arr[0] + 4:ind_arr[1]]
            ipnum = str(parser(html))
        except Exception:
            logger.info('fail to load url: ' + task.url)
            ipnum = ''
        with codecs.open('result.txt', 'a', encoding='utf-8') as f:
            f.write(task.url[30:] + ':' + ipnum + '\n')
        # worker_info layout: (starting_time, url, times, Popen object)
        worker_info = (int(time.time()), task.url, task.times, worker)
        time.sleep(1)
        # update worker info
        self.__total_worker_instances += 1
        self.__workers.append(worker_info)
    except Exception as e:
        logger.error("failed to launch worker " + str(e))
from wordcloud import WordCloud
from Parser import parser
from InfoParser import infoparser

ogg = parser()
ogg.leggi()
# NOTE: `text` is not defined in this snippet; it has to be set elsewhere
# (e.g. from whatever the parser read above) before generating the cloud.
wordcloud = WordCloud().generate(text)
#image = wordcloud.to_image()
#image.show()
        _, variable, value = x
        env[variable] = interpretor(value, env)
    else:
        operation = env[x[0]]
        operands = [interpretor(y, env) for y in x[1:]]
        return operation(*operands)


def if_interpretor(x, env):
    _, test, correct, incorrect = x
    if interpretor(test, env):
        return interpretor(correct, env)
    else:
        return interpretor(incorrect, env)


def print_interpretor(x, env):
    if type(x[1]) == list:
        print(' '.join(x[1]))
    else:
        for y in x[1:]:
            print(env[y], end=' ')
        print()


parsed_output, _ = parser(
    """(begin (if (number? *) (print (YES))(print (NO))))""")
#print(parsed_output)
interpretor(parsed_output, env)
#(if (> (val x) 0) (fn (+ (aref A i) 1) (quote (one two))))
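# The fragment above starts mid-function, so the dispatcher that routes
# 'begin', 'if', 'print' and definition forms is missing. This is a
# hypothetical reconstruction of that dispatch (the special-form names and
# structure are assumptions), shown only so the surviving branches and the
# *_interpretor helpers above read as one piece; it is not the original code.
def interpretor_sketch(x, env):
    if isinstance(x, str):              # symbol: look it up in the environment
        return env[x]
    if not isinstance(x, list):         # literal value, already evaluated
        return x
    if x[0] == 'begin':                 # evaluate expressions in order
        result = None
        for expr in x[1:]:
            result = interpretor_sketch(expr, env)
        return result
    if x[0] == 'if':
        return if_interpretor(x, env)
    if x[0] == 'print':
        return print_interpretor(x, env)
    if x[0] == 'define':                # matches the `_, variable, value = x` branch above
        _, variable, value = x
        env[variable] = interpretor_sketch(value, env)
    else:                               # ordinary procedure call
        operation = env[x[0]]
        operands = [interpretor_sketch(y, env) for y in x[1:]]
        return operation(*operands)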
from Lexer import lexer
from Parser import parser
from Generator import generator
from Optimizer import optimizer

rules = [(r'\b[X]{1,3}[V]?[I]{1,3}\b|\b[X]{1,3}[V]?\b|' +
          r'\b[X]{0,3}[I][XV]\b|\b[V][I]{0,3}\b|\b[I]{1,3}\b', 'ROMAN_DIGIT'),
         (r'\bwhile\b', 'LOOP'),
         (r'\bdone\b', 'ENDLOOP'),
         (r'[A-Za-z][A-Za-z0-9_]*', 'VAR'),
         (r'[0-9]*\.[0-9]*', 'FLOAT'),
         (r'[1-9][0-9]*|[0]', 'INT'),
         (r'\<', 'LESS'),
         (r'\=', 'EQ'),
         (r'\>', 'LARG'),
         (r'\*', 'MPY'),
         (r'\/', 'DIV'),
         (r'\+', 'ADD'),
         (r'\-', 'SUB'),
         (r':=', 'ASSIGN'),
         (r'[\(\)]', 'GROUP'),
         (r'\;', 'END_EXPR'),
         (r'[\^\&\%\:\#\@\!\~\`\'\"\$]*', 'UNKNOWN')]

if __name__ == "__main__":
    with open('input.txt') as file:
        source_code = file.read()
    id_table, token_list = lexer(source_code, rules)
    print(id_table)
    try:
        ast = parser(token_list)
        print('Parse tree')
        print(ast)
        object_code = generator(ast, id_table)
        object_code = optimizer(object_code)
        with open('object_code.txt', 'w') as file:
            file.write('\n\n'.join(
                [''.join(command) for command in object_code]))
    except SyntaxError as error:
        print(error)
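# Hypothetical example of what input.txt could contain for the token rules
# above (a VAR assigned a ROMAN_DIGIT, a while/done loop, ':=' assignments and
# ';' terminators). Whether parser() accepts exactly this shape is an
# assumption; the sample only illustrates the lexer's vocabulary.
sample_source = (
    "counter := X;\n"
    "while counter > 0;\n"
    "counter := counter - 1;\n"
    "done;\n"
)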
from Parser import parser

pars = parser('machine_code')
pars.program()
def parse(self, codes):
    self.parsed = [parser(c) for c in codes]
import sys

from Parser import parser
from CodeWriter import codewriter

filename = sys.argv[1]
vm_code = parser(filename)
assembly_code = codewriter(filename, vm_code)
def run(self):
    if len(self.urlbuffer) == 0:
        return
    depth = 0
    for url in self.urlbuffer:
        # Crawl the URL
        failed = 0
        doneflag = 0
        while not doneflag == 1:
            try:
                if failed > self.retry:
                    break
                # Spoof a Mozilla User-Agent
                user_agent = "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:13.0) Gecko/20100101 Firefox/13.0.1"
                headers = {'User-Agent': user_agent}
                reqobj = Request(url=url, headers=headers)
                urlobj = urlopen(reqobj, timeout=self.timeout)
                content = urlobj.read()
                if len(content) == 0:
                    failed += 1
                    continue
                doneflag = 1
            # HTTPError or URLError may be raised here
            except:
                failed += 1
        if failed > self.retry:
            continue
        #slogger.notice({'msg':'url get success','url':url})
        # ????????????? could be improved with multithreading ?????????????
        # A crawled page has to be parsed before it can go into the database
        parseobj = parser()
        self.setparser(parseobj)
        # Parse the fetched text
        if self.parser.parsetext(content) == False:
            continue
        parse_content = ""
        for word in self.parser.wordlist:
            # Strip line breaks from the saved text
            word = word.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')
            parse_content += word + ' '
        try:
            self.parser.close()
        except:
            continue
        webnodeobj = None
        # Consolidate the collected URL information
        newlinklist = []
        for link in self.parser.linklist:
            newlinklist.append(urljoin(url, link))
        # Build the entity bean handed to the indexer
        if len(self.parser.wordlist) > 0:
            webnodeobj = webnode(url, self.parser.title, content, parse_content,
                                 newlinklist, self.parser.wordlist)
        # Add new links to the crawl queue
        if self.deep >= depth:
            depth += 1
            for link in newlinklist:
                if self.deep >= depth:
                    self.urlbuffer.append(link)
        # Parse the word list and index it
        if self.indexer != None and webnodeobj != None:
            self.indexer.index(webnodeobj)
    self.spider = spider()
    self.indexer = indexer()
    self.parser = parser()
    self.urllist = urllist

def start(self):
    if len(self.urllist) == 0:
        return False
    self.spider.addurllist(self.urllist)
    self.spider.setparser(self.parser)
    self.spider.setindexer(self.indexer)
    self.spider.run()
    return True

def cleanup(self):
    self.indexer.closedb()


if __name__ == "__main__":
    spider = spider()
    #spider.addurl('http://localhost:9080/setest/test.php')
    spider.addurl('http://hq.booksarefun.com/')
    parserobj = parser()
    indexobj = indexer()
    spider.setparser(parserobj)
    spider.setindexer(indexobj)
    spider.run()
    indexobj.closedb()
    print('done!')
def __init__(self, urllist=None):
    # Avoid a mutable default argument; None stands in for "no URLs yet".
    self.spider = spider()
    self.indexer = indexer()
    self.parser = parser()
    self.urllist = urllist if urllist is not None else []
#################################################################
##                          MAIN FILE                          ##
#################################################################
from Sprint1 import sprint1
from Sprint2 import sprint2
from Sprint3 import sprint3
from Sprint4 import sprint4
from Parser import parser, individual, family

parser()
sprint1(individual, family)
sprint2(individual, family)
sprint3(individual, family)
sprint4(individual, family)