def main(query):
    """Dispatch to a string-matching backend selected on the command line.

    argv[1] selects the algorithm (1=KMP, 2=Boyer-Moore, 3=regex) and
    argv[2] is the pattern handed to it.  Unknown selectors do nothing.
    """
    choice = int(sys.argv[1])
    pattern = sys.argv[2]
    backends = {1: kmp.kmp, 2: bm.bm, 3: regex.regex}
    backend = backends.get(choice)
    if backend is not None:
        backend(pattern)
def __init__(self):
    """Initialize empty model state.

    BUGFIX: the original assigned self.re, self.map and self.vocab twice,
    constructing a second (immediately discarded) regex() and my_map().
    Each attribute is now set exactly once.
    """
    self.re = regex()            # project regex helper
    self.map = my_map()          # label/name mapping helper
    self.clf = None              # classifier, populated later
    self.strong_learner = None   # boosted learner, populated later
    self.vocab = None            # vocabulary, populated later
    self.max_length = None       # max sequence length, populated later
def __init__(self, run=True):
    """Initialize empty model state and optionally start processing.

    Parameters:
        run: when true (default), self.run() is invoked immediately.

    BUGFIX: the original assigned self.re, self.map and self.vocab twice,
    constructing a second (discarded) regex() and my_map().
    """
    self.re = regex()            # project regex helper
    self.map = my_map()          # label/name mapping helper
    self.clf = None              # classifier, populated later
    self.strong_learner = None   # boosted learner, populated later
    self.vocab = None            # vocabulary, populated later
    self.max_length = None       # max sequence length, populated later
    self.spliter = SentenceSpliter()
    if run:
        self.run()
def __init__(self):
    """Initialize empty model state and run the base Tokenizer pipeline.

    BUGFIX: the original assigned self.re, self.map and self.vocab twice,
    constructing a second (discarded) regex() and my_map().
    """
    self.re = regex()            # project regex helper
    self.map = my_map()          # label/name mapping helper
    self.clf = None              # classifier, populated later
    self.strong_learner = None   # boosted learner, populated later
    self.vocab = None            # vocabulary, populated later
    self.max_length = None       # max sequence length, populated later
    self.spliter = SentenceSpliter()
    # explicit unbound call keeps the base-class behavior even if a
    # subclass overrides run()
    Tokenizer.run(self)
def extractContent(content, settings):
    """
    Extract the desired content from the supplied raw text from a file.

    Inputs:
        filename[unicode]: The file to read (known to exist already).
        settings[dict]: The setting from the createToken method.

    NOTE(review): the docstring names a ``filename`` input but the first
    parameter is the raw text ``content`` -- confirm and reconcile.

    Returns (content, line): the extracted text and the 1-based(?) line
    number where it starts in the original text.
    """
    raw = copy.copy(content) # complete copy of original
    # Exactly one extraction mode is applied, in priority order:
    # regex > single line > line range.
    if settings['re'] is not None:
        # NOTE(review): eval() on 're-flags' executes arbitrary code if the
        # settings come from untrusted input -- confirm the source is trusted.
        content = regex(settings['re'], content, eval(settings['re-flags']))
    elif settings['line'] is not None:
        content = extractLine(content, settings["line"])
    elif (settings['start'] is not None) or (settings['end'] is not None):
        content = extractLineRange(content, settings['start'], settings['end'], settings['include-start'], settings['include-end'])
    content = prepareContent(content, settings)
    # Locate the line: find the first line of the extracted content, then
    # search for it in the original text.
    line = 1
    match = re.search(r'(.*?)$', content, flags=re.MULTILINE)
    if match is not None:
        first = match.group(1)
        for i, raw_line in enumerate(raw.splitlines()):
            if first in raw_line:
                line = i
                # NOTE(review): `continue` is a no-op here, so `line` ends up
                # at the LAST matching raw line (0-based), not the first --
                # a `break` (and possibly `i + 1`) may have been intended.
                continue
    return content, line
def __init__(self, uploaded_files, text, option):
    """Run the selected string-search algorithm over the uploaded files.

    Parameters:
        uploaded_files: file content handed to the matcher's convertText().
        text: pattern to search for (matched case-insensitively).
        option: "pilihan1" -> KMP, "pilihan2" -> Boyer-Moore, else regex.

    Stores the result rows in self.hasil.

    BUGFIX: kmpMatch/bmMatch were each called twice per branch; the match
    index is now computed once and reused.
    """
    needle = text.lower()
    if option == "pilihan1":
        matcher = kmp.kmp()
        matcher.convertText(uploaded_files)
        idx = matcher.kmpMatch(needle)
        if idx == -1:
            self.hasil = [[ '', "Tidak ditemukan " + text + " pada file.", '' ]]
        else:
            self.hasil = [[ '', "indeks pada file: " + str(idx), '' ]]
    elif option == "pilihan2":
        matcher = boyce.boyce()
        matcher.convertText(uploaded_files)
        idx = matcher.bmMatch(needle)
        if idx == -1:
            self.hasil = [[ '', "Tidak ditemukan " + text + " pada file.", '' ]]
        else:
            self.hasil = [[ '', "indeks pada file: " + str(idx), '' ]]
    else:
        # regex mode matches the original (non-lowercased) text
        reg = regex.regex(uploaded_files)
        self.hasil = reg.regexMatch(text)
def formatPcb(sub_pcb,tmpDir,copies):
    # Rewrite each copied .pcb file in place so Connect()/Element[/Net()
    # names carry a per-copy suffix ("_Ax", "_Bx", ...).  Nets named "PGND"
    # are left untouched so all copies share one power/ground net.
    bltup = tuple([0,0,0,0])
    for cop in range(copies):
        let = chr(ord('A') + cop)  # suffix letter for this copy: A, B, C, ...
        reg_conn = re.compile(r"(.*)Connect\(\"(\D+)(\d+)")
        subs_conn = r'\1Connect("\2_%sx\3' % let
        # pattern text comes from the project-level regex() helper
        reg_ele = re.compile(regex('EleRef',bltup))
        subs_ele = r'Element[\1 \2 "\3_%sx\4" \5' % let
        reg_Net = re.compile(r'Net\("(\w+)" "(.*)"\)')
        subs_Net = r'Net("\1_%sx" "\2")' % let
        reg_Netpgnd = re.compile(r'Net\("PGND"')
        file = "%s%s%s" % (tmpDir,sub_pcb,cop)
        # progress output; emitted before fileinput redirects stdout
        print file
        # inplace=True: everything printed below replaces the file contents
        for line in fileinput.input([file],inplace = True):
            mat_conn = reg_conn.findall(line)
            mat_ele = reg_ele.findall(line)
            mat_Net = reg_Net.findall(line)
            mat_Netpgnd = reg_Netpgnd.findall(line)
            if mat_conn:
                line = reg_conn.sub(subs_conn,line)
            elif mat_ele:
                line = reg_ele.sub(subs_ele,line)
            elif mat_Net:
                # rename ordinary nets only; PGND stays shared across copies
                if not(mat_Netpgnd):
                    line = reg_Net.sub(subs_Net,line)
            # trailing comma: line already ends with its own newline
            print line,
def removeBox_pcb(file,pts): l1 = regex('l1',pts) l2 = regex('l2',pts) l3 = regex('l3',pts) l4 = regex('l4',pts) for line in fileinput.input([file],inplace=True): if (line == "%s%s" %(l1,"\n")): a =1 elif (line == "%s%s" % (l2,"\n")): a=1 elif (line == "%s%s" % (l3,"\n")): a=1 elif (line == "%s%s" % (l4,"\n")): a=1 else: print line
def run_M_m(V):
    """Build a ':%s/' (whole-file substitute) command from V's pending
    command and feed it to the editor, then clear the pending command."""
    # strip the leading marker and trailing CR, if present
    if V.pending_command[-1:] == CR:
        V.pending_command = V.pending_command[1:-1]
    builder = regex.regex(":%s/")
    builder.source(V.pending_command)
    final_command = builder.get_final()
    V.input(final_command)
    V.pending_command = ""
def getRefdegs(sub_sch,dir):
    """Search through sub_sch and return all reference designators.

    Scans copy 0 of the schematic, rewriting it in place with the 'sref'
    substitution, and returns a list of (prefix, suffix) tuples -- one per
    designator matched by the project 'ref' pattern.
    """
    zero_tuple = tuple([0,0,0,0])
    sch_path = "%s%s0" % (dir,sub_sch)
    ref_pattern = re.compile(regex('ref',zero_tuple))
    replacement = regex('sref',zero_tuple)
    designators = []
    # inplace=True: lines written to stdout replace the file contents
    for text_line in fileinput.input([sch_path], inplace=True):
        hits = ref_pattern.findall(text_line)
        if hits:
            designators.append((hits[0][0], hits[0][1]))
            text_line = ref_pattern.sub(replacement, text_line)
        sys.stdout.write(text_line)
    return designators
def run_M_s(V):
    """Build a ':s/' (current-line substitute) command from V's pending
    command -- split on '/' before sourcing -- and feed it to the editor."""
    if V.pending_command[-1:] == CR:
        V.pending_command = V.pending_command[1:-1]
    builder = regex.regex(":s/")
    pieces = V.pending_command.split("/")
    builder.source(pieces)
    final_command = builder.get_final()
    V.input(final_command)
    V.pending_command = ""
def run_M_S(V):
    """Like run_M_s, but global: the 'g' flag is added so the substitution
    applies to every occurrence on the line."""
    if V.pending_command[-1:] == CR:
        V.pending_command = V.pending_command[1:-1]
    builder = regex.regex(":s/")
    builder.source(V.pending_command)
    builder.add_flag('g')
    final_command = builder.get_final()
    V.input(final_command)
    V.pending_command = ""
def drawBox_pcb(file,pts): l1 = regex('l1',pts) l2 = regex('l2',pts) l3 = regex('l3',pts) l4 = regex('l4',pts) reg_dr = re.compile(r"Layer\(1 ") prtNext = False for line in fileinput.input([file],inplace=True): mat = reg_dr.findall(line) if prtNext == True: prtNext = False print line, print l1 print l2 print l3 print l4 else: print line, if mat: a = 1 prtNext = True
def formatSch(sub_sch,tmpDir,copies): """Format all .sch files in dir to have incremented refdegs""" # change .sch files bltup = tuple([0,0,0,0]) reg = re.compile(regex('stdref',bltup)) for cop in range(0,copies): let = chr(ord('A') + cop) subs = r"refdes=\1_%sx\2" % (let) file = "%s%s%s" % (tmpDir,sub_sch,cop) for line in fileinput.input([file],inplace=True): mat = reg.sub(subs,line) print mat,
def js_detect(url, r, debug=False):
    """Collect inline <script> text from the HTML document *r* and run the
    project regex matcher over it.

    Parameters:
        url: base URL of the page (kept for interface compatibility; only
             needed by the currently-disabled external-script fetching).
        r: raw HTML text to parse.
        debug: when true, log when an external script reference is seen.

    Returns the regex matcher's result list, or [] when no inline script
    text was found.

    BUGFIX: the bare ``except:`` (which swallowed every error, including
    KeyboardInterrupt) is narrowed to the KeyError raised when a <script>
    tag has no 'src' attribute.  Large blocks of commented-out code and the
    unused PyJsParser instance were removed.
    """
    soup = bs(r, 'html.parser')
    tot_script = ""
    for script in soup.find_all('script'):
        try:
            script['src']  # external script: not fetched (feature disabled)
            if debug:
                print("getting outer js")
        except KeyError:
            # no src attribute -> inline script; accumulate its text
            tot_script += script.get_text()
    reg_result = []
    if tot_script != "":
        reg_result = regex().match(tot_script)
    return reg_result
# -*- encoding: utf-8 -*- import unicodedata import regex from nlp_tools import tokenizer my_regex = regex.regex() def is_exist(dictionary, element): try: _ = dictionary[element] return True except: return False def preprocessing(data, tokenize=True): data = unicodedata.normalize('NFKC', data) if tokenize: data = tokenizer.predict(data) data = my_regex.detect_url.sub(u'', data) data = my_regex.detect_url2.sub(u'', data) data = my_regex.detect_email.sub(u'', data) data = my_regex.detect_datetime.sub(u'', data) data = my_regex.detect_num.sub(u'', data) data = my_regex.normalize_special_mark.sub(u' \g<special_mark> ', data) data = my_regex.detect_exception_chars.sub(u'', data) data = my_regex.detect_special_mark.sub(u'', data) data = my_regex.detect_special_mark2.sub(u'', data) data = my_regex.detect_special_mark3.sub(u'', data)
# -*- encoding: utf-8 -*- import regex import os, sys import my_map import utils from io import open import unicodedata from nlp_tools import tokenizer r = regex.regex() def load_dataset_from_disk(dataset): list_samples = {k: [] for k in my_map.name2label.keys()} stack = os.listdir(dataset) print 'loading data in ' + dataset while (len(stack) > 0): file_name = stack.pop() file_path = os.path.join(dataset, file_name) if (os.path.isdir(file_path)): utils.push_data_to_stack(stack, file_path, file_name) else: print('\r%s' % (file_path)), sys.stdout.flush() with open(file_path, 'r', encoding='utf-16') as fp: content = unicodedata.normalize('NFKC', fp.read()) content = r.run(tokenizer.predict(content)) dir_name = utils.get_dir_name(file_path) list_samples[dir_name].append(content) print('')
# a simple lexer that tokenizes based on whitespace from regex import regex lowercase = "a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z" uppercase = "A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z" symbols = "+|-|=|.|,|/|%|^|<|>" alphabet = lowercase rule = regex("(%s)(%s)*"%(alphabet, alphabet)) white_space = regex(" *") def lex(s): s = s[white_space.greedy(s):] if not s: return [] word_c = rule.greedy(s) return [s[:word_c]]+lex(s[word_c:]) print lex("abc def ghijk")
def copy(pcb_file,spcb_file,copies,projPath,tmpPath):
    """
    *get MinMax pts from pcb and sub
    *generate offsets for all copies
    *open temporary files
    **Elements
    **nets
    *Look in pcb0
    **Skip lines until after attribute line
    **copy elements from all files into elemTemp file
    ***shift coordinates when adding elems to this file
    **copy layer1 lines into lay1temp file
    ***shift coord
    **copy layer2 lines into lay2temp file
    ***shift coord
    **copy Nets to be inserted in existing pcb nets into CopnetTemp
    **copy Nets to be combined with existing pcb nets in ComNetTemp
    """
    # Two-pass merge: pass 1 reads every sub-pcb copy and collects shifted
    # elements / layer-1 lines / layer-2 lines / nets; pass 2 rewrites the
    # main pcb file in place, splicing the collected sections in.
    blTup = tuple([0,0,0,0])
    pcb_file = "%s%s" % (projPath,pcb_file)
    subpcb_file = "%s%s%s" % (tmpPath,spcb_file,0)
    # per-copy coordinate offsets, computed by the project getShift() helper
    shift = getShift(pcb_file,subpcb_file,copies)
    x_shift = shift[0]
    y_shift = shift[1]
    # section-boundary patterns come from the project-level regex() helper
    # NOTE(review): reg_befAt is compiled but never used below.
    reg_befAt = re.compile(regex('befAt',blTup))
    reg_ele = re.compile(regex('Ele',blTup))
    reg_lay1 = re.compile(regex('lay1',blTup))
    reg_lay2 = re.compile(regex('lay2',blTup))
    reg_lay3 = re.compile(regex('lay3',blTup))
    reg_line = re.compile(regex('line',blTup))
    reg_netcop = re.compile(regex('net',blTup))
    reg_netcom = re.compile(regex('netpgnd',blTup))
    # accumulators filled by pass 1, consumed by pass 2
    ele = []       # shifted Element[ lines (plus their bodies)
    lay1 = []      # shifted layer-1 Line[ entries
    lay2 = []      # shifted layer-2 Line[ entries
    inNetcop = []  # nets copied verbatim into the main NetList
    inNetcom = []  # nets (e.g. PGND) merged into existing main-pcb nets
    # ---- pass 1: harvest each sub-pcb copy -------------------------------
    for cop in range(copies):
        subpcb_file = "%s%s%s" % (tmpPath,spcb_file,cop)
        state = searchEnum.befAtt # Initialize searching state
        for line in fileinput.input([subpcb_file],inplace = False):
            if state == searchEnum.befAtt:
                # skip header until the first Element[ line
                mat = reg_ele.findall(line)
                if mat:
                    mat = mat[0]
                    state = searchEnum.inEle
                    # shift element origin by this copy's offset
                    x = int(mat[1]) + x_shift[cop]
                    y = int(mat[2]) + y_shift[cop]
                    line = "Element[%s %s %s %s" % (mat[0],x,y,mat[3])
                    ele.append(line)
            elif state == searchEnum.inEle:
                mat = reg_lay1.findall(line)
                mat_ele = reg_ele.findall(line)
                if mat_ele:
                    # another element header: shift and keep collecting
                    mat_ele = mat_ele[0]
                    state = searchEnum.inEle
                    x = int(mat_ele[1]) + x_shift[cop]
                    y = int(mat_ele[2]) + y_shift[cop]
                    line = "Element[%s %s %s %s" % (mat_ele[0],x,y,mat_ele[3])
                    ele.append(line)
                elif mat:
                    state = searchEnum.inlay1
                else:
                    # element body lines pass through unshifted
                    ele.append(line)
            elif state == searchEnum.inlay1:
                mat = reg_lay2.findall(line)
                matig = re.compile(r"^(\()$").findall(line)
                matig2 = re.compile(r"^(\))$").findall(line)
                if mat:
                    state = searchEnum.inlay2
                elif matig or matig2:
                    a = 1  # bare '(' / ')' delimiter lines are dropped
                else:
                    # shift the Line[ endpoints by this copy's offset
                    lnpts = reg_line.findall(line)[0]
                    x0 = int(lnpts[0]) + x_shift[cop]
                    y0 = int(lnpts[1]) + y_shift[cop]
                    x1 = int(lnpts[2]) + x_shift[cop]
                    y1 = int(lnpts[3]) + y_shift[cop]
                    line = " Line[%s %s %s %s %s\n" % (x0,y0,x1,y1,lnpts[4])
                    lay1.append(line)
            elif state == searchEnum.inlay2:
                mat = reg_lay3.findall(line)
                matig = re.compile(r"^(\()$").findall(line)
                matig2 = re.compile(r"^(\))$").findall(line)
                if mat:
                    state = searchEnum.inlay3
                elif matig or matig2:
                    a = 1  # delimiter lines dropped, as above
                else:
                    lnpts = reg_line.findall(line)[0]
                    x0 = int(lnpts[0]) + x_shift[cop]
                    y0 = int(lnpts[1]) + y_shift[cop]
                    x1 = int(lnpts[2]) + x_shift[cop]
                    y1 = int(lnpts[3]) + y_shift[cop]
                    line = " Line[%s %s %s %s %s\n" % (x0,y0,x1,y1,lnpts[4])
                    lay2.append(line)
            elif state == searchEnum.inlay3:
                # waiting for the first net header of either kind
                mat_cop = reg_netcop.findall(line)
                mat_com = reg_netcom.findall(line)
                if mat_com:
                    inNetcom.append(line)
                    state = searchEnum.inNetcom
                elif mat_cop:
                    inNetcop.append(line)
                    state = searchEnum.inNetcop
            elif state == searchEnum.inNetcom:
                mat_cop = reg_netcop.findall(line)
                mat_com = reg_netcom.findall(line)
                if mat_cop:
                    inNetcop.append(line)
                    state = searchEnum.inNetcop
                else:
                    inNetcom.append(line)
            elif state == searchEnum.inNetcop:
                mat_cop = reg_netcop.findall(line)
                mat_com = reg_netcom.findall(line)
                if mat_com:
                    state = searchEnum.inNetcom
                    inNetcom.append(line)
                elif mat_cop:
                    inNetcop.append(line)
                else:
                    inNetcop.append(line)
    #Remove ending parenthesis
    temp = []
    reg = re.compile(r'^\)')
    for ln in inNetcop:
        mat = reg.findall(ln)
        if not(mat):
            temp.append(ln)
    inNetcop = temp
    #Format the insert combined nets into a dictionary
    # keyed by net name (e.g. 'PGND'); values are that net's member lines
    key = 'none'
    netComDict = {key:[]}
    for ln in inNetcom:
        mat = reg_netcom.findall(ln)
        mat_ign = re.compile(r'\($').findall(ln)
        mat_ign2 = re.compile(r'^\t\)$').findall(ln)
        if mat_ign or mat_ign2:
            a = 1  # structural '(' / ')' lines ignored
        elif mat:
            key = mat[0]  # a net header starts a new bucket
        else:
            try:
                netComDict[key].append(ln)
            except KeyError :
                netComDict[key] = []
                netComDict[key].append(ln)
    #Now move the list vars to the main pcb file
    '''
    ele = []
    lay1 = []
    lay2 = []
    inNetcop = []
    inNetcom = []
    class searchEnum:
        befAtt = 0 # Search has not yet passed the Attribute line
        inEle = 1 # Search is in the elements section
        inlay1 = 2 # Search is in the layer1 section
        inlay2 = 3 # Search is in the layer2 section
        inlay3 = 4
        inNetcom = 5 # Search is in the nets to be copied section
        inNetcop = 6 # Search is in the nets to be combined section
    reg_befAt = re.compile(regex('befAt',blTup))
    reg_ele = re.compile(regex('Ele',blTup))
    reg_lay1 = re.compile(regex('lay1',blTup))
    reg_lay2 = re.compile(regex('lay2',blTup))
    reg_lay3 = re.compile(regex('lay3',blTup))
    reg_line = re.compile(regex('line',blTup))
    reg_netcop = re.compile(regex('net',blTup))
    reg_netcom = re.compile(regex('netpgnd',blTup))
    '''
    # ---- pass 2: splice the collected sections into the main pcb ---------
    # NOTE(review): this pass uses searchEnum.inNet, which the commented-out
    # enum above does not define -- the real searchEnum must declare it.
    state = searchEnum.befAtt
    for line in fileinput.input([pcb_file],inplace = True):
        if state == searchEnum.befAtt:
            mat = reg_ele.findall(line)
            if mat:
                # insert all harvested elements before the first existing one
                state = searchEnum.inEle
                for ln in ele:
                    print ln,
                print line,
            else:
                print line,
        elif state == searchEnum.inEle:
            mat = reg_lay1.findall(line)
            if mat:
                # open layer 1 and append the harvested layer-1 lines
                state = searchEnum.inlay1
                print line,
                print "(\n"
                for ln in lay1:
                    print ln,
            else:
                print line,
        elif state == searchEnum.inlay1:
            mat_ign = re.compile(r'^\(').findall(line)
            mat = reg_lay2.findall(line)
            if mat_ign:
                a = 1  # drop the original '(' (we already emitted one)
            elif mat:
                state = searchEnum.inlay2
                print line,
                print "(\n"
                for ln in lay2:
                    print ln,
            else:
                print line,
        elif state == searchEnum.inlay2:
            mat_ign = re.compile(r'^\(').findall(line)
            mat = reg_lay3.findall(line)
            if mat_ign:
                a = 1
            elif mat:
                state = searchEnum.inlay3
                print line,
            else:
                print line,
        elif state == searchEnum.inlay3:
            mat_net = re.compile(r'^NetList\(\)').findall(line)
            if mat_net:
                # open the NetList and append the copied nets
                state = searchEnum.inNet
                print line,
                print "(\n"
                for ln in inNetcop:
                    print ln,
            else:
                print line,
        elif state == searchEnum.inNet:
            mat_ign = re.compile(r'^\(').findall(line)
            mat_com = reg_netcom.findall(line)
            if mat_ign:
                a = 1
            elif mat_com:
                # existing combined net (e.g. PGND): merge harvested members
                print line,
                print " (\n"
                for ln in netComDict[mat_com[0]]:
                    print ln,
                state = searchEnum.inNetcop
            else:
                print line,
        elif state == searchEnum.inNetcop:
            # swallow exactly one line after a merge, then resume
            state = searchEnum.inNet
    #eof
    '''
    print "lay2\n"
    for ln in lay2:
        print ln,
    '''
    '''
    print "netComDict\n"
    for ln in netComDict['PGND']:
        print ln,
    '''
    # NOTE(review): the trailing triple-quote below opens a string literal
    # that is not closed within this chunk -- it likely comments out the
    # remainder of the original file; confirm against the full source.
    '''
    # NOTE(review): this return belongs to a function whose definition
    # starts before this chunk; it is shown here incomplete.
    return connection, db

if __name__ == '__main__':
    import io
    from regex import regex
    from tokenizer.tokenizer import Tokenizer

    config = get_stories.config
    connection, db = get_stories.connect2mongo(config['MONGO_HOST'], config['MONGO_PORT'], config['MONGO_USER'], config['MONGO_PASS'], config['MONGO_DB'])
    stories = get_stories()
    stories.run(db)
    r = regex()
    # Prepare Data
    # Clean and tokenize each fetched story, writing lowercase lines out.
    with io.open('vie.txt', 'w+', encoding="utf-8") as f:
        for story in stories.new_stories:
            reg_text = r.run(story)
            # NOTE(review): ViTokenizer is not imported in this chunk --
            # presumably imported earlier in the file; confirm.
            token_text = ViTokenizer.tokenize(reg_text)
            f.write(token_text.lower())
            f.write(u'\n')
    print "Save data success into vie.txt"
    # start_time is presumably set near the top of the file -- confirm.
    elapsed_time = time.time() - start_time
    print "Total_Time for Excute: ", elapsed_time
    connection.close()