def cleanup_csv(line): ''' Introduced in Version 3.2.4, I wrote this function in response to a need to create a decision tree for a very large national econometric database. The fields in the CSV file for this database are allowed to be double quoted and such fields may contain commas inside them. This function also replaces empty fields with the generic string 'NA' as a shorthand for "Not Available". IMPORTANT: This function skips over the first field in each record. It is assumed that the first field in each record is an ID number for the record. ''' line = line.translate(bytes.maketrans(b"()[]{}'", b" ")) \ if sys.version_info[0] == 3 else line.translate(string.maketrans("()[]{}'", " ")) double_quoted = re.findall(r'"[^\"]+"', line[line.find(',') : ]) for item in double_quoted: clean = re.sub(r',', r'', item[1:-1].strip()) parts = re.split(r'\s+', clean.strip()) line = str.replace(line, item, '_'.join(parts)) white_spaced = re.findall(r',\s*[^,]+\s+[^,]+\s*,', line) for item in white_spaced: if re.match(r',\s+,', item) : continue replacement = '_'.join(re.split(r'\s+', item[:-1].strip())) + ',' line = str.replace(line, item, replacement) fields = re.split(r',', line) newfields = [] for field in fields: newfield = field.strip() if newfield == '': newfields.append('NA') else: newfields.append(newfield) line = ','.join(newfields) return line
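# A hedged usage sketch for cleanup_csv above (the sample record is made up, not
# taken from the econometric database the docstring mentions). Assuming `re`,
# `sys` and (on Python 2) `string` are imported in the enclosing module:
#
#   sample = '42,"Smith, John",,"New York City",3.7'
#   cleanup_csv(sample)   # -> '42,Smith_John,NA,New_York_City,3.7'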
def summary_up_result(result_file, ignore, row_head, column_mark):
    """
    Summarize monitor results (or other results laid out in matrix form).
    Currently it calculates the average value for each item in the results.

    @result_file: file containing the results to average
    @ignore: pattern for comment lines in the results that should be thrown away
    @row_head: pattern for the row items
    @column_mark: pattern for the first line of the matrix, used to generate
                  the column items
    Return: A dictionary with the average value of the results
    """
    head_flag = False
    result_dict = {}
    column_list = {}
    row_list = []
    fd = open(result_file, "r")
    for eachLine in fd:
        if len(re.findall(ignore, eachLine)) == 0:
            if len(re.findall(column_mark, eachLine)) != 0 and not head_flag:
                column = 0
                _, row, eachLine = re.split(row_head, eachLine)
                for i in re.split("\s+", eachLine):
                    if i:
                        result_dict[i] = {}
                        column_list[column] = i
                        column += 1
                head_flag = True
            elif len(re.findall(column_mark, eachLine)) == 0:
                column = 0
                _, row, eachLine = re.split(row_head, eachLine)
                row_flag = False
                for i in row_list:
                    if row == i:
                        row_flag = True
                if row_flag is False:
                    row_list.append(row)
                    for i in result_dict:
                        result_dict[i][row] = []
                for i in re.split("\s+", eachLine):
                    if i:
                        result_dict[column_list[column]][row].append(i)
                        column += 1
    fd.close()
    # Calculate the average value
    average_list = {}
    for i in column_list:
        average_list[column_list[i]] = {}
        for j in row_list:
            average_list[column_list[i]][j] = {}
            check = result_dict[column_list[i]][j][0]
            if utils_misc.aton(check) or utils_misc.aton(check) == 0.0:
                count = 0
                for k in result_dict[column_list[i]][j]:
                    count += utils_misc.aton(k)
                average_list[column_list[i]][j] = "%.2f" % (count / len(result_dict[column_list[i]][j]))
    return average_list
def hm_tree(p, nwords, split_lvl=0, ctype=''):
    splits = [r';', r',']
    split_lvl = min(split_lvl, len(splits)-1)
    # check for method claim
    if ctype == '':
        # re.search returns a Match object or None, so test it directly
        if re.search(r'method', p):
            ctype = 'method'
        else:
            ctype = 'device'
    # first split along parent -> [children] line
    if ctype == 'device':
        split_words = r'(compris\w+|has|having|including)'
    else:
        split_words = r'(compris\w+\sthe\ssteps\sof)'
    split_markers = r'.*?(?::|-|\s)(?:\sa\splurality\sof)?'
    parts_rgx = r'^(.*?)' + split_words + split_markers + r'(.*)$'
    parts = re.match(parts_rgx, p)
    if parts:
        # NOTE: could change which words from head chunk are selected here
        parent = get_head_words(parts.group(1), nwords, ctype)
        # then split the [children] array
        children = re.split(splits[split_lvl] + r'(?:\s*and)?', parts.group(3))
        if len(children) == 1:
            # fall back to splitting on the word "and" (word-bounded so words
            # such as "band" or "command" are not split apart)
            children = re.split(r'\band\b', parts.group(3))
        return Tree(parent, merge_trees([hm_tree(child, nwords, split_lvl+1, ctype) for child in children]))
    else:
        # try splitting on splitters here
        # NOTE: to do later...? danger of pulling in lots of crap
        return Tree(get_head_words(p, nwords, ctype), [])
def parse_dmapdumpstring(dumpstring): scandata = {} scan = dumpstring.split('scalars:')[-1].split('arrays:') scalars = scan[0].split('\n') vectors = re.split(VECTOR_SPLITTER, scan[1]) for scalar in scalars: if scalar == '': continue assignment = scalar.split('\t')[-1].split(' = ') var = assignment[0].lstrip('"').rstrip('"') value = eval(assignment[1]) scandata[var] = value for vector in vectors: vector = vector.split('=') if len(vector) <= 1: continue var = vector[0].split('"')[1] vecvalue = [] for v in re.split(ELEM_SPLITTER, vector[1]): v = v.rstrip(',') if v == '': continue if v == 'inf' or v == 'nan' or v == '-nan': v = 'float("NaN")' try: vecvalue.append(eval(v)) except: print 'error parsing vector' scandata[var] = np.array(vecvalue) return scandata
def get_head_words(s, nwords, ctype):
    #print ctype
    #print s
    # first limit to before any commas, semicolons; and remove stop list phrases
    s = re.split(r'[;,]', s)[0]
    remove_list = r'(a\splurality\sof\s|at\sleast|composition\sof|the\ssteps\sof|wherein\s*(?:said)?|first|second|third|(?:[a-z]|\d+)?(?:\)|\.))'
    s = re.sub(remove_list, '', s)
    if ctype == 'device':
        # get first ~ <JJ>*<NN>+ chunk
        return first_JN_chunk(s, nwords)
    elif ctype == 'method':
        # first try to split around "method" (for first parent node)
        msplit1 = re.split(r'method\s(of|for|to)', s)
        if len(msplit1) > 1:
            return first_V_chunk(msplit1[2], nwords)
        msplit2 = re.split(r'method', s)
        if len(msplit2) > 1:
            return first_V_chunk(msplit2[0], nwords)
        # else, get first VBG + its subject if possible
        return first_V_chunk(s, nwords)
def _extract_metadata(content): tree = etree.fromstring(content) ns = {'xhtml': 'http://www.w3.org/1999/xhtml'} subject = tree.xpath('//xhtml:title', namespaces=ns)[0].text metadata_nodes = tree.xpath('//xhtml:meta', namespaces=ns) metadata_nodes = [n for n in metadata_nodes if 'name' in n.attrib] metadata = {} for node in metadata_nodes: metadata[node.attrib['name']] = node.attrib['content'] for n in metadata_nodes: n.getparent().remove(n) content = etree.tostring(tree, pretty_print=True, encoding=unicode) sender = metadata.get('mail-sender', u'') to_recipients_txt = metadata.get('mail-to-recipients', u'') cc_recipients_txt = metadata.get('mail-cc-recipients', u'') bcc_recipients_txt = metadata.get('mail-bcc-recipients', u'') to_recipients = filter(None, re.split(r'\s*,\s*', to_recipients_txt)) cc_recipients = filter(None, re.split(r'\s*,\s*', cc_recipients_txt)) bcc_recipients = filter(None, re.split(r'\s*,\s*', bcc_recipients_txt)) return content, subject, sender, to_recipients, cc_recipients, bcc_recipients
def __init__(self, filename, myopen=open, swapYZ=False): super(MeshPLY,self).__init__() with myopen(filename, "r") as f: assert f.readline().strip() == "ply" assert f.readline().strip().startswith("format ascii") elementCounts = [] while True: line = f.readline().strip() if line == "end_header": break args = re.split("\\s+",line) if len(args) >= 3 and args[0] == 'element': elementCounts.append((args[1],int(args[2]))) assert len(elementCounts) >= 2 for element,count in elementCounts: for i in range(count): line = f.readline().strip() if element == 'vertex': args = re.split("\\s+",line) if swapYZ: v = V3(float(args[0]),float(args[2]),-float(args[1])) else: v = V3(float(args[0]),float(args[1]),float(args[2])) self.vertices.append(v) elif element == 'face': args = re.split("\\s+",line) count = int(args.pop(0)) v = tuple(int(args[j]) for j in range(count)) self.faces.append((0,v)) assert self.vertices assert self.faces
def parse_range_string(input_lines): ip_range_list = [] ip_lines_list = re.split("\r|\n", input_lines) for raw_line in ip_lines_list: raw_s = raw_line.split("#") context_line = raw_s[0] context_line = context_line.replace(' ', '') ips = re.split(",|\|", context_line) for line in ips: if len(line) == 0: #print "non line:", line continue begin, end = ip_utils.split_ip(line) if ip_utils.check_ip_valid(begin) == 0 or ip_utils.check_ip_valid(end) == 0: print("ip format is error,line:%s, begin: %s,end: %s" % (line, begin, end)) continue nbegin = ip_utils.ip_string_to_num(begin) nend = ip_utils.ip_string_to_num(end) ip_range_list.append([nbegin,nend]) #print begin, end ip_range_list.sort() return ip_range_list
def tokenize(lines): tokens = [] strings = [] functions = {} new_lines = '' for i, line in enumerate(lines): line = re.sub(r'#.*$', "", line) line = re.sub('\n', ' ', line) line = re.sub('\t', '', line) line = re.split('\'', line) for j, c in enumerate(line): if j % 2 == 0: new_lines += c else: strings.append(c) new_lines += 'string ' + str(len(strings) - 1) new_lines = re.split(';', new_lines) for i, token in enumerate(new_lines): if token != '' and token != ' ' and token != '\t': token = token.strip() token = re.split(' ', token) if i % 2 != 0: functions[token[0]] = token[1:] else: tokens += token tokens = substitute_tokens(tokens) return [tokens, strings, functions]
def save_config(self, data): """Save changes to the configuration table.""" cursor = self.db_conn.cursor() cursor.execute('''INSERT INTO configs VALUES (?, ?, ?, ?, ?, ?)''', (data[0], data[1], data[2], data[3], data[6], data[7],)) self.db_conn.commit() cursor.close() if type(data[4]) is str: channels = re.split(',? ', data[4]) else: channels = data[4] if type(data[5]) is str: botops = re.split(',? ', data[5]) else: botops = data[5] cursor = self.db_conn.cursor() if channels != ['']: for chan in channels: if chan[0] != "#": chan = "#" + chan cursor.execute('''INSERT INTO channels VALUES (NULL, ?, 0, ?)''', (chan, data[0])) self.db_conn.commit() cursor.close() cursor = self.db_conn.cursor() if botops != ['']: for op in botops: cursor.execute('''INSERT INTO users VALUES (NULL, ?, NULL, NULL, 1, ?)''', (op, data[0])) self.db_conn.commit() cursor.close()
def scrape_and_look_for_next_link(url): html = scraperwiki.scrape(url) #print html root = lxml.html.fromstring(html) soup = BeautifulSoup(html) #using BeautifulSoup to find next page links scrape_table(root) #before carrying on scrape the hrefs using the scrape_table function #print soup items = soup.findAll('a',title="Next page") # findAll "next page" links if items: # if there is a next page link continue next_link = root.cssselect("div.srch-Page.srch-Page-bg a") #print next_link if next_link: next_link2 = next_link[2].attrib['href'] #print next_link2 split_link = re.split("\)+",next_link2) split_link2 = re.split("\=+",split_link[0]) split_link3 = re.split("\'+",split_link2[2]) #print split_link3[0] #print split_link2 #if split_link ==11: next_url = nextlink_url+split_link3[0] if next_url: print next_url scrape_and_look_for_next_link(next_url)
def create_new(self):
    """Create a new configuration."""
    verify = ''
    while verify != 'y':
        print('\n')
        name = ""
        while name == "":
            name = input("Unique name for this configuration: ")
            cursor = self.db_conn.cursor()
            cursor.execute('''SELECT * FROM configs WHERE name = ?''', (name,))
            data = cursor.fetchone()
            cursor.close()
            if data:
                print('The name "{0}" is not unique.'.format(name))
                name = ""
        nick = self.prompt("Nick", "GorillaBot")
        realname = self.prompt("Realname", "GorillaBot")
        ident = self.prompt("Ident", "GorillaBot")
        chans = self.prompt("Channel(s)")
        botop = self.prompt("Bot operator(s)", '')
        password = self.prompt("Server password (optional)", hidden=True)
        youtube = self.prompt("YouTube API key (optional)", hidden=True)
        chans = re.split(',? ', chans)
        botop = re.split(',? ', botop)
        self.display((name, nick, realname, ident, password, youtube), chans, botop)
        verify = input('Is this configuration correct? [y/n]: ').lower()
    self.save_config((name, nick, realname, ident, chans, botop, password, youtube))
    return name
def verify(self, data, chans, botops):
    """Verify a configuration, and make changes if needed."""
    verify = input('Is this configuration correct? [y/n]: ').lower()
    if verify == 'y':
        return
    else:
        verify = ''
        while verify != 'y':
            print('\n')
            name = data[0]
            nick = self.prompt("Nick", data[1])
            realname = self.prompt("Realname", data[2])
            ident = self.prompt("Ident", data[3])
            chans = self.prompt("Chans", ", ".join(chans))
            botop = self.prompt("Bot operator(s)", ", ".join(botops))
            password = self.prompt("Server password (optional)", hidden=True)
            youtube = self.prompt("YouTube API key (optional)", hidden=True)
            chans = re.split(',? ', chans)
            botop = re.split(',? ', botop)
            self.display((name, nick, realname, ident, password, youtube), chans, botop)
            verify = input('Is this configuration correct? [y/n]: ').lower()
        self.delete(name)
        cursor = self.db_conn.cursor()
        cursor.execute('''DELETE FROM channels WHERE config = ?''', (name,))
        cursor.execute('''DELETE FROM users WHERE config = ?''', (name,))
        self.db_conn.commit()
        cursor.close()
        self.save_config((name, nick, realname, ident, chans, botop, password, youtube))
def _parse_meta(fname): """Get the metadata as a dict out of the mitGCM mds .meta file.""" flds = {} basename = re.match("(^.+?)\..+", os.path.basename(fname)).groups()[0] flds["basename"] = basename with open(fname) as f: text = f.read() # split into items for item in re.split(";", text): # remove whitespace at beginning item = re.sub("^\s+", "", item) # match = re.match('(\w+) = ', item) match = re.match("(\w+) = (\[|\{)(.*)(\]|\})", item, re.DOTALL) if match: key, _, value, _ = match.groups() # remove more whitespace value = re.sub("^\s+", "", value) value = re.sub("\s+$", "", value) # print key,':', value flds[key] = value # now check the needed things are there needed_keys = ["dimList", "nDims", "nrecords", "dataprec"] for k in needed_keys: assert k in flds # transform datatypes flds["nDims"] = int(flds["nDims"]) flds["nrecords"] = int(flds["nrecords"]) # use big endian always flds["dataprec"] = np.dtype(re.sub("'", "", flds["dataprec"])).newbyteorder(">") flds["dimList"] = [[int(h) for h in re.split(",", g)] for g in re.split(",\n", flds["dimList"])] if "fldList" in flds: flds["fldList"] = [re.match("'*(\w+)", g).groups()[0] for g in re.split("'\s+'", flds["fldList"])] assert flds["nrecords"] == len(flds["fldList"]) return flds
def update_index_html(dest_dir, sectnum):
    # Process index.html separately from the modules files
    with open(dest_dir + 'index.html', 'r') as index_html_file:
        index_html = index_html_file.readlines()
    for line_num, line in enumerate(index_html):
        #inject css rule to remove haiku's orange bullets
        if '</head>' in line:
            index_html[line_num] = line.replace('</head>','<style>\nul li {\n\tbackground: none;\n\tlist-style-type: none;\n}\n</style>\n</head>')
        elif 'class="section"' in line:
            sectnum += 1
        elif 'RegisterBook' in line:
            #remove registerbook page from TOC
            index_html[line_num] = ''
        elif 'hide-from-toc' in line:
            #remove stub chapter title
            if '<h1>' in index_html[line_num-1]:
                index_html[line_num-1] = ''
        elif 'class="toctree-l' in line and 'Gradebook' not in line and 'TODO List' not in line:
            # pass IGNORECASE via the flags keyword; as a bare positional argument
            # it would be interpreted as re.split's maxsplit parameter
            title = re.split('>', re.split('</a>', line, flags=re.IGNORECASE)[0], flags=re.IGNORECASE)[-1]
            new_title = '%s.' % sectnum + title
            index_html[line_num] = line.replace(title, new_title)
    # Write the modified contents back to index.html
    with open(dest_dir + 'index.html', 'wb') as index_html_file:
        index_html_file.writelines(index_html)
def _parse_taxon_from_line(self, line, line_index):
    if self.strict:
        seq_label = line[:10].strip()
        line = line[10:]
    else:
        if self.multispace_delimiter:
            parts = re.split('[ \t]{2,}', line, maxsplit=1)
        else:
            parts = re.split('[ \t]{1,}', line, maxsplit=1)
        seq_label = parts[0]
        if len(parts) < 2:
            line = ''
        else:
            line = parts[1]
    seq_label = seq_label.strip()
    if not seq_label:
        raise self._data_parse_error("Expecting taxon label", line_index=line_index)
    if self.underscores_to_spaces:
        seq_label = seq_label.replace('_', ' ')
    current_taxon = self.char_matrix.taxon_set.require_taxon(label=seq_label)
    if current_taxon not in self.char_matrix:
        self.char_matrix[current_taxon] = dataobject.CharacterDataVector(taxon=current_taxon)
    else:
        if len(self.char_matrix[current_taxon]) >= self.nchar:
            raise self._data_parse_error("Cannot add characters to sequence for taxon '%s': already has declared number of characters (%d)" \
                    % (current_taxon.label, len(self.char_matrix[current_taxon])), line_index=line_index)
    return current_taxon, line
def compare_time(start, end): """ <Purpose> Manually compares two times. Returns True if end time is more recent than start time. Returns False otherwise. <Arguments> start time end time <Exceptions> None <Returns> Bool """ s = re.split('-|\+|:| ', start) e = re.split('-|\+|:| ', end) if s[0] > e[0]: return False if s[1] > e[1]: return False if s[2] > e[2]: return False if s[3] > e[3]: return False if s[4] > e[4]: return False if s[5] > e[5]: return False return True
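# Note on compare_time above: the fields come back from re.split as strings and
# are compared one at a time with early returns, so timestamps that roll over a
# field boundary (e.g. start='2022-12-31 23:59:59' vs end='2023-01-01 00:00:00')
# are misordered by the month check even though the year check passed. A more
# robust sketch under the same 'YYYY-MM-DD HH:MM:SS'-style layout (an
# illustration only, not the project's implementation):
#
#   def compare_time_tuples(start, end):
#       s = [int(x) for x in re.split('-|\+|:| ', start)]
#       e = [int(x) for x in re.split('-|\+|:| ', end)]
#       return s <= e   # element-wise comparison handles rollovers correctly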
def __call__(self, source, output_file, contexts_path): self.output = output_file blocks = re.split("^((?:<<<<<<<|>>>>>>>)[^\n]*\n)", source, flags=re.MULTILINE) in_conflict = False for index, block in enumerate(blocks): if (index & 1) == 0: if in_conflict: blocks = re.split("^(=======[^\n]*\n)", block, flags=re.MULTILINE) else: blocks = [block] for index, block in enumerate(blocks): if (index & 1) == 0: if block: for token, value in self.lexer.get_tokens(block): self.highlightToken(token, value) else: assert block[0] == "=" self.output.write(htmlutils.htmlify(block)) else: assert block[0] == "<" or block[0] == ">" self.output.write(htmlutils.htmlify(block)) in_conflict = block[0] == "<"
def addresses(filename, with_subnetsize=None):
    """find ip addresses configured on all interfaces from filename
    and return dict with interface=>(ip=>address, ipv6=>address)"""
    parseresult = filterConfig(filename, "interface", "^interface|^ip address|^ipv6 address")
    ret = dict()
    for sec in parseresult:
        intret = ""
        for line in sec:
            reobj = re.match("interface (.*)", line)
            if reobj:
                intret = reobj.group(1)
            if intret:
                # FIXME: exclude interfaces with shutdown configured
                reobj = re.match("(ip|ipv6) address (.*)", line)
                if reobj:
                    afi = reobj.group(1)
                    if afi == "ip" and with_subnetsize:
                        ip = reobj.group(2).split(" ")[0]
                        if ipaddr.IPAddress(ip).version != 4:
                            continue
                        hostmask = reobj.group(2).split(" ")[1]
                        address = str(ipaddr.IPv4Network(ip + "/" + hostmask))
                    elif afi == "ipv6" and with_subnetsize:
                        address = re.split('[ ]', reobj.group(2))[0]
                    else:
                        address = re.split('[\/ ]', reobj.group(2))[0]
                    if intret not in ret:
                        ret[intret] = dict()
                    ret[intret].update({afi: address})
    return ret
def main(): parser = ArgumentParser(description="", formatter_class=RawDescriptionHelpFormatter, add_help=True) parser.add_argument("--data-directory", dest="data_directory", default=None, help="path to directory containing the source instance data to use") parser.add_argument("--output-directory", dest="output_directory", default=None, help="path to directory for all of the output instance data") parser.add_argument("--duplicates-file", dest="duplicates_file", default=None, help="path to file containing list of duplicate instance data, rows of <shasum> <count> <instance1> <instance2> ...") args = parser.parse_args() # organize all of the duplicate information # rows are in the format <shasum> <instance count> <instance 1> <instance 2> instances_to_duplicates = {} instance_keys_to_paths = {} with open(args.duplicates_file, 'r') as f: for line in f: line = line.lstrip().rstrip() components = line.split(' ') shasum = components[0] count = int(components[1]) instances = components[2:] instance_keys = [re.split('\.[a-z]+$', os.path.basename(x))[0] for x in instances] for key,path in zip(instance_keys, instances): instance_keys_to_paths[key] = path for key in instance_keys: remaining = list(instance_keys) remaining.remove(key) instances_to_duplicates[key] = remaining for instance_data in os.listdir(args.data_directory): instance_components = re.split('\.([a-z]+)$', instance_data) instance_key = instance_components[0] instance_extension = instance_components[1] # copy the instance data, then copy it to its duplicate keys if needed instance_path = "{}/{}".format(args.data_directory, instance_data) shutil.copy(instance_path, args.output_directory) if instance_key in instances_to_duplicates: for dupe in instances_to_duplicates[instance_key]: dupe_filename = "{}.{}".format(dupe, instance_extension) source = instance_keys_to_paths[instance_key] dest = instance_keys_to_paths[dupe] prefix = os.path.commonprefix([source, dest]) source_suffix = source.replace(prefix, '') dest_suffix = dest.replace(prefix, '') # modify the content to contain the right file. with open(instance_path, 'r') as source_file: with open("{}/{}".format(args.output_directory, dupe_filename), 'w') as dest_file: for line in source_file: modified_line = line.rstrip().replace(source_suffix, dest_suffix) print(modified_line, file=dest_file)
def __init__(self, jsFileString, settings, tabCharacter):
    self.jsFileString = jsFileString
    self.settings = settings
    self.tabCharacter = tabCharacter
    pattern = r'(define|require)\s*\(\s*\[(.*?)\]\s*?,\s*?function\s*?\((.*?)\)'
    self.requireMatch = re.search(pattern, jsFileString, flags=re.MULTILINE | re.DOTALL)
    if (self.requireMatch is not None and
            len(self.requireMatch.groups()) == self.NUM_GROUPS):

        def removeQuotes(s):
            return s.replace('"', '').replace("'", "")

        pathsGroupString = str(self.requireMatch.group(self.PATHS_GROUP))
        pathsGroupString = pathsGroupString.strip(' \t\n')
        splitPaths = re.split('[\s\n]*,[\s\n]*', pathsGroupString)
        self.paths = list(map(removeQuotes, splitPaths))
        self.args = re.split('[\s\n]*,[\s\n]*',
                             str(self.requireMatch.group(self.ARGS_GROUP)).strip(' \t\n'))
        if len(self.paths) > 0 and len(self.paths[0]) == 0:
            self.paths = []
        if len(self.args) > 0 and len(self.args[0]) == 0:
            self.args = []
    else:
        # keep the attribute name consistent with the branch above
        self.paths = None
        self.args = None
def cmpAlphaNum(str1, str2):
    # Python 2 style comparator (-1/0/1 via cmp); orders strings so that
    # embedded runs of digits compare numerically.
    str1 = str1.lower()
    str2 = str2.lower()
    ReSplit = '(\d+)'
    str1 = re.split(ReSplit, str1)
    str2 = re.split(ReSplit, str2)
    # drop the empty strings re.split leaves at the ends
    if '' == str1[0]:
        str1.remove('')
    if '' == str1[len(str1)-1]:
        str1.remove('')
    if '' == str2[0]:
        str2.remove('')
    if '' == str2[len(str2)-1]:
        str2.remove('')
    for i in range(min(len(str1), len(str2))):
        try:
            tmp = int(str1[i])
            str1[i] = tmp
        except ValueError:
            pass
        try:
            tmp = int(str2[i])
            str2[i] = tmp
        except ValueError:
            pass
        if str1[i] == str2[i]:
            continue
        if str1[i] > str2[i]:
            return 1
        else:
            return -1
    return cmp(len(str1), len(str2))
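# Usage sketch for cmpAlphaNum above (hypothetical inputs; Python 2, since it
# relies on the built-in cmp):
#
#   cmpAlphaNum('file2', 'file10')   # -> -1, i.e. 'file2' orders before 'file10'
#   cmpAlphaNum('a10', 'a10')        # -> 0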
def extractValues (self, line): parts = re.split(':', line) raw_values = re.split(',', parts[1]) values = [] for rv in raw_values: values.append(self.cleanValue(rv)) return values
def epg_list(self): try: now = datetime.datetime.now() now = '%04d' % now.year + '%02d' % now.month + '%02d' % now.day + '%02d' % now.hour + '%02d' % now.minute + '%02d' % now.second file = open(addonEPG,'r') read = file.read() file.close() programmes = re.compile('(<programme.+?</programme>)').findall(read) except: return for programme in programmes: try: start = re.compile('start="(.+?)"').findall(programme)[0] start = re.split('\s+', start)[0] stop = re.compile('stop="(.+?)"').findall(programme)[0] stop = re.split('\s+', stop)[0] if not int(start) <= int(now) <= int(stop): raise Exception() channel = common.parseDOM(programme, "programme", ret="channel")[0] title = common.parseDOM(programme, "title")[0] title = common.replaceHTMLCodes(title).encode('utf-8') desc = common.parseDOM(programme, "desc")[0] desc = common.replaceHTMLCodes(desc).encode('utf-8') epg = "[B][%s] - %s[/B]\n%s" % ('ÔÙÑÁ'.decode('iso-8859-7').encode('utf-8'), title, desc) self.epg.update({channel: epg}) except: pass
def __init__(self, gtf_line): self.gtf_list = gtf_line self.seqname, self.source, self.feature, self.start, self.end, self.score, self.strand, self.frame, self.attribute = gtf_line # These indexes are defined by the GTF spec tmp = map(lambda x: re.split('\s+', x.replace('"', '')), re.split('\s*;\s*', self.attribute.strip().strip(';'))) self.attribute = dict([x for x in tmp if len(x)==2]) # convert attrs to dict self.start, self.end = int(self.start) - 1, int(self.end)
def GetRestaurantGrid(d, zip): br.select_form("Form1") br.set_all_readonly(False) dt = 'dgResults$ctl' + str(d) + '$ctl00' # print dt br["__EVENTTARGET"] = dt br["__EVENTARGUMENT"] = '' request = br.click() response1 = br1.open(request) # find the window open hidden in the script html1 = response1.read() # print html1 root1 = lxml.html.fromstring(html1) rest_name = root1.cssselect("span#lblName")[0].text rest_address = root1.cssselect("span#lblAddress")[0].text cityStateZip = root1.cssselect("span#lblCityStateZip")[0].text city = re.split(",", cityStateZip)[0] rest_inspectionDate = root1.cssselect("span#lblLastInspection")[0].text if rest_inspectionDate == " ": date = "" else: date = re.split(":", rest_inspectionDate)[1].strip() violations = parseViolations(html1) # print violations scraperwiki.sqlite.save(unique_keys=["dt"], data={"dt": dt + "_" + zip + "_" + str(datetime.date.today()), "name": rest_name, "address": rest_address, "city": city, "state":"NY", "zip": zip, "inspection_date": date, "violations": violations, "time_scraped":datetime.datetime.now(), "page_id" : dt})
def ReadCropAttrs(cropFile): if not os.path.exists(cropFile): cropFile = TXT_DB_DIR + os.sep + CROP_FILE f = open(cropFile) lines = f.readlines() f.close() attrDic = {} fields = [item.replace('"', '') for item in re.split('\t|\n', lines[0]) if item is not ''] n = len(fields) for i in xrange(n): attrDic[fields[i]] = {} for line in lines[2:]: items = [item.replace('"', '') for item in re.split('\t', line) if item is not ''] id = int(items[0]) for i in xrange(n): dic = attrDic[fields[i]] try: dic[id] = float(items[i]) except: dic[id] = items[i] return attrDic
def find_time_interval(fits): """ find time interval of the fits file input: fits --- fits file name output: [tmin, tmax] --- start and stop time in seconds from 1998.1.1 """ cmd = 'dmstat "' + fits + '[cols time]" centroid=no >' + zspace scf.run_ascds(cmd) out = scf.read_file(zspace, remove=1) chk = 0 for val in out: mc1 = re.search('min', val) mc2 = re.search('max', val) if mc1 is not None: atemp = re.split('\s+', val) tmin = int(float(atemp[1])) chk += 1 elif mc2 is not None: atemp = re.split('\s+', val) tmax = int(float(atemp[1])) chk += 1 if chk > 1: break return [tmin, tmax]
def split_string_with_lines(string, indentation = "", chars_per_line = 100): # expert splitting mode matches = re.split(''',(?=(?:[^'"]|'[^']*'|"[^"]*")*$)''', string) splitted = [] for s in matches: splitted.append( s + ("," if s != matches[-1] else "")) res = [] buf = "" for s in splitted: if len(s) > chars_per_line: splitted2 = re.split(''' (?=(?:[^'"]|'[^']*'|"[^"]*")*$)''', s) for s2 in splitted2: ext = s2 if s2 == splitted2[-2]: if len(splitted2[-1]) <= 5: ext += " " + splitted2[-1] buf += ext + (" " if s2 != splitted2[-1] and ext == s2 else "") if len(buf) >= chars_per_line or s2 == splitted2[-1]: res.append(buf) buf = "" if ext != s2: break else: buf += s if len(buf) >= chars_per_line or s == splitted[-1]: res.append(buf) buf = "" return ("\n%s"%indentation).join( res ), len( res )
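# The quote-aware pattern used above splits on a comma (or space) only when the
# remainder of the string can be consumed as plain characters or complete quoted
# runs, so delimiters inside quotes are left alone. A quick illustration with
# made-up input:
#
#   >>> re.split(''',(?=(?:[^'"]|'[^']*'|"[^"]*")*$)''', 'a, "x, y", b')
#   ['a', ' "x, y"', ' b']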
def parse_relationship(expression): """ Parse a relationship expression containing a package name and (optionally) a version relation of the form ``python (>= 2.6)``. Raises :py:exc:`ValueError` when parsing fails. An example: >>> from deb_pkg_tools.deps import parse_relationship >>> parse_relationship('python') Relationship(name='python') >>> parse_relationship('python (<< 3)') VersionedRelationship(name='python', operator='<<', version='3') :param expression: A relationship expression (a string). :returns: A :py:class:`Relationship` object. """ tokens = [t.strip() for t in re.split('[()]', expression) if t and not t.isspace()] if len(tokens) == 1: # Just a package name (no version information). return Relationship(tokens[0]) elif len(tokens) != 2: # Encountered something unexpected! msg = "Corrupt package relationship expression: Splitting name from relationship resulted in more than two tokens! (expression: %r, tokens: %r)" raise ValueError(msg % (expression, tokens)) else: # Package name followed by relationship to specific version(s) of package. name, relationship = tokens tokens = [t.strip() for t in re.split('([<>=]+)', relationship) if t and not t.isspace()] if len(tokens) != 2: # Encountered something unexpected! msg = "Corrupt package relationship expression: Splitting operator from version resulted in more than two tokens! (expression: %r, tokens: %r)" raise ValueError(msg % (relationship, tokens)) return VersionedRelationship(name, *tokens)
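# Splitting on a capturing group keeps the delimiter in the result list, which
# is how the code above recovers both the operator and the version. A short
# illustration (example values only, not taken from deb_pkg_tools):
#
#   >>> [t.strip() for t in re.split('([<>=]+)', '>= 2.6') if t and not t.isspace()]
#   ['>=', '2.6']
#   >>> [t.strip() for t in re.split('[()]', 'python (<< 3)') if t and not t.isspace()]
#   ['python', '<< 3']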
import pickle import re # import jieba # import jieba.analyse import operator data = [] with open("./newsData.pkl", 'rb') as fr: data = pickle.load(fr) likes = dict() posts = dict() for i in range(len(data)): authors = re.split('、|,' ,data[i][1].replace(" ","")) likesCnt = data[i][2] for author in authors: try: likes[author] += int(likesCnt) posts[author] += 1 except: likes[author] = int(likesCnt) posts[author] = 1 authorAvgLikes = dict() for author in likes: authorAvgLikes[author] = int(likes[author]/posts[author]) print(sorted(authorAvgLikes.items(), key=operator.itemgetter(1))) # allTag = [] # for i in range(1, len(data)): # allTag += jieba.analyse.extract_tags(data[i][1],topK=3)
dfs_2(each_vertex) if counter > max_scc[4]: max_scc[4] = counter max_scc.sort(reverse = True) def dfs_2(vertex): global counter leaders[vertex] = current_source counter += 1 for each_vertex in adjacent_list[vertex]: if leaders[each_vertex] < 0: dfs_2(each_vertex) input_file = open("SCC.txt") for each_line in input_file: vertex1, vertex2 = re.split('[ \t\n\r]', each_line.strip()) vertex1 = int(vertex1) - 1 vertex2 = int(vertex2) - 1 reversed_adjacent_list[vertex2].append(vertex1) input_file.close() dfs_loop_1() del reversed_adjacent_list input_file = open("SCC.txt") for each_line in input_file: vertex1, vertex2 = re.split('[ \t\n\r]', each_line.strip()) vertex1 = int(vertex1) - 1 vertex2 = int(vertex2) - 1 adjacent_list[vertex1].append(vertex2) input_file.close() leaders = [-1 for i in range(MAX)] dfs_loop_2()
print(l); print(list("%d" % x for x in range(1, 10))); print("Hi {0}, 成绩提高了{1:.1f}%".format("小明", 1.254)); print("Hi {0}, 成绩提高了{1}%".format("小明", 1.254)); print("Hi {0}, 成绩提高了{1}%".format("小明", "%.1f"%1.254)); print("=".join(["cdsac","cdsa","dewqd"])); # ==== 正则表达式 ==== email_re = "^[\w-]+(\.[\w-]+)*@[\w-]+(\.[\w-]+)+$"; if re.match(email_re, "hujiangyx163.com"): print("ok"); else: print("error"); # ---- 切分字符串 ---- print("a b c".split(" ")); print(re.split(r'\s+',"a b c")); print(re.split(r"[\s\,\;]+", "a,b;; c d")); # ---- 分组 ---- match = re.match(r'^(\d{3})-(\d{3,8})$', "020-123456") print(match.group()); print(match.group(0)); print(match.group(1)); print(match.group(2)); new_line = r'截至9月2日0时,全省累计报告新型冠状病毒肺炎确诊病例653例(其中境外输入112例),' \ r'累计治愈出院626例,死亡3例,目前在院隔离治疗24例,964人尚在接受医学观察'; new_line_re = r'^截至9月2日0时,全省累计报告新型冠状病毒肺炎确诊病例(\d+)例\(其中境外输入(\d+)例\),' \ r'累计治愈出院(\d+)例,死亡(\d+)例,目前在院隔离治疗(\d+)例,(\d+)人尚在接受医学观察$'; new_line_math = re.match(new_line_re, new_line); print(new_line_math.group(0)); print(new_line_math.group(1)); print(new_line_math.group(2));
def analizar(self): self.texto = "" arr = re.split("\n", self.fila.texto) for (linea) in arr: self.texto += mejorar_links(linea)
return self._stemmer.stem(word).lower() grail = nltk.corpus.webtext.words('grail.txt') text = IndexedText(porter, grail) text.concordance('lie') wnl = nltk.WordNetLemmatizer() [wnl.lemmatize(t) for t in tokens] raw = """'When I'M a Duchess,' she said to herself, (not in a very hopeful tone though), 'I won't have any pepper in my kitchen AT ALL. Soup does very well without--Maybe it's always pepper that makes people hot-tempered,'...""" import re re.split(r' ', raw) re.split(r'[ \t\n]+', raw) re.split(r'\s+', raw) re.split(r'\W+', raw) re.findall(r'\w+|\S\w*', raw) text = 'That U.S.A. poster-print costs $12.40...' pattern = r'''(?x)([A-Z]\.)+| \w+(-\w+)*| \$?\d+(\.\d+)?%?| \.\.\.| [][.,;"'?():-_`]''' nltk.regexp_tokenize(text, pattern) fdist = nltk.FreqDist( ['dog', 'cat', 'dog', 'cat', 'dog', 'snake', 'dog', 'cat']) for word in sorted(fdist): print(word, ":", fdist[word], end='; ')
def dir_path_handle(path_string, name_handle_func): name_list = re.split(r'[\\/]', path_string) crypto_list = [name_handle_func(s) for s in name_list] return '/'.join(crypto_list)
def get_active_zone_set(self): """Return the active zone configuration. Return active zoneset from fabric. When none of the configurations are active then it will return empty map. :returns: Map -- active zone set map in the following format .. code-block:: python { 'zones': {'openstack50060b0000c26604201900051ee8e329': ['50060b0000c26604', '201900051ee8e329'] }, 'active_zone_config': 'OpenStack_Cfg' } """ zone_set = {} zone = {} zone_member = None zone_name = None switch_data = None zone_set_name = None try: switch_data = self._get_switch_info( [zone_constant.GET_ACTIVE_ZONE_CFG]) except exception.BrocadeZoningCliException: with excutils.save_and_reraise_exception(): LOG.error( _LE("Failed getting active zone set " "from fabric %s"), self.switch_ip) try: for line in switch_data: line_split = re.split('\\t', line) if len(line_split) > 2: line_split = [x.replace('\n', '') for x in line_split] line_split = [x.replace(' ', '') for x in line_split] if zone_constant.CFG_ZONESET in line_split: zone_set_name = line_split[1] continue if line_split[1]: zone_name = line_split[1] zone[zone_name] = list() if line_split[2]: zone_member = line_split[2] zone_member_list = zone.get(zone_name) zone_member_list.append(zone_member) zone_set[zone_constant.CFG_ZONES] = zone zone_set[zone_constant.ACTIVE_ZONE_CONFIG] = zone_set_name except Exception: # In case of parsing error here, it should be malformed cli output. msg = _("Malformed zone configuration: (switch=%(switch)s " "zone_config=%(zone_config)s).") % { 'switch': self.switch_ip, 'zone_config': switch_data } LOG.exception(msg) raise exception.FCZoneDriverException(reason=msg) switch_data = None return zone_set
def __init__(self, raw_string, split_expression=r'\W+', bow=True, entity=[]): """Initializer. Args: raw_string: string with raw text in it split_expression: string will be split by this. bow: if True, a word is the same everywhere in the text - i.e. we will index multiple occurrences of the same word. If False, order matters, so that the same word will have different ids according to position. entity: list with the indices of the entity for which the explanations are required. Used to perturb the entity always as a group and don't break it apart. """ split_expression_non_vocab = r'\W+' # added for the conll data set self.raw = raw_string self.as_list = re.split(r'(%s)|$' % split_expression, self.raw) self.as_np = np.array(self.as_list) non_word = re.compile(r'(%s)|$' % split_expression_non_vocab).match self.string_start = np.hstack( ([0], np.cumsum([len(x) for x in self.as_np[:-1]]))) vocab = {} self.inverse_vocab = [] self.positions = [] self.bow = bow non_vocab = set() for i, word in enumerate(self.as_np): if word in non_vocab: continue if non_word(word): non_vocab.add(word) continue if bow: if word not in vocab: vocab[word] = len(vocab) self.inverse_vocab.append(word) self.positions.append([]) idx_word = vocab[word] self.positions[idx_word].append(i) else: self.inverse_vocab.append(word) self.positions.append(i) if not bow: self.positions = np.array(self.positions) # Get new indices for the entity (if there is one) if entity: # Calculate indices with respect to as_list encoding (only works for space as the split_expression) if split_expression == ' ': self.entity_as_list = [ele * 2 for ele in entity] else: print( 'Need to split the example per space (set split_expression == ' ' (in lime_text.py).') return # Calculate indices with respect to the vocab encoding (the one LIME calculates by removing nonvocab tokens) self.entity_as_vocab = [] for ele in self.entity_as_list: # Need to use np since positions is a numpy array idx_array = np.where( self.positions == ele) # returns a tuple (array, dtype) if idx_array[0].size != 0: self.entity_as_vocab.append(idx_array[0].item()) else: print( 'Problem finding indices of the entities (np.where in lime_text.py).' ) print('For raw string ' + raw_string) return
row_count += 1 print 'Total Tickets to be checked: ', row_count for i in range(2, row_count + 2): DL = ws.cell(row=i, column=22).value Wo_Ref = ws.cell(row=i, column=1).value print 'WO Ref taken from the corrective task excel', Wo_Ref Yesterday_Comment = [] DL1 = DL.encode("utf-8") #print type (DL1) #print DL1 #print len(DL1.splitlines()) #pattern=re.compile(r'\d+/\d+/\d+\s\d+:\d+:\d+\s[A-Z]{2}') pattern = re.compile(r'\d+:\d+:\d+\s[A-Z]{2}') DL_Filtered = re.split(pattern, DL1) #print DL_Filtered for j in DL_Filtered: if date1 in j: Yesterday_Comment.append(j.lower()) #print 'The Yesterday_Comment List: ',Yesterday_Comment #print Yesterday_Comment #print 'Length: ',len(Yesterday_Comment) Yesterday_Comment_Rev = list(Yesterday_Comment) Yesterday_Comment_Rev.reverse() #Day_DL=' ' #for i in Yesterday_Comment: # Day_DL=Day_DL+str(i)
def split_term_classifiers(line): # type: (unicode) -> List[Union[unicode, None]] # split line into a term and classifiers. if no classifier, None is used.. parts = re.split(' +: +', line) + [None] return parts
import re text = open("hindi_file.txt",encoding= 'utf-8').read() #This converts the encoded text to an internal unicode object, where # all characters are properly recognized as an entity: words = re.split(r'\s+', re.sub(r'[,/\-!?.|lIред"\]\[<>br]', ' ', text).strip()) print(words) text = ' '.join(words) fh = open("hindi.txt","w", encoding='utf-8').write(text)
#rename the tip names in a Newick tree based on a tab-delimited file linking name codes to full names from ete3 import Tree import re, sys #arg1 - tree file #arg2 - names mapping file #arg3 - output file with renamed Newick tree names = {} tree = Tree(sys.argv[1]) #Newick tree #read in a tab-delimited species names file tblfile = open(sys.argv[2]) for line in tblfile: fields = re.split("\t", line.rstrip()) names[fields[0]] = fields[1] for leaf in tree: if leaf.name in names: leaf.name = names[leaf.name] tree.write(outfile=sys.argv[3])
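# Example invocation of the renaming script above (file names are hypothetical);
# the mapping file is expected to hold one tab-separated "code<TAB>full name"
# pair per line, matching the re.split("\t", ...) call:
#
#   python rename_tips.py input.nwk codes_to_names.tsv renamed.nwk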
def load_instance(path_to_file): ### load raw text ### f = open(path_to_file, 'r') raw_lines = f.read().splitlines() stripped_lines = [] for line in raw_lines: line = line.replace('\t', ',').replace('[', '').replace(']', '') stripped_lines.append(re.split(',', line)) ### first line ### first_line = stripped_lines[0] n_activities = int( first_line[0]) # number of activities (incl. dummy activities) n_resources = int(first_line[1]) ### load tasks (main body of instance file) ### tasks = {} for activity in range(n_activities): ### first block ### line1 = stripped_lines[activity + 1] task_id = int(line1[0]) n_successors = [ int(line1[2]), int(line1[3]), int(line1[4]), int(line1[5]) ] # [# SS successors, # SF successors, # FS successors, # FF successors] successors = [[] for i in range(4)] k = 0 # counter to track where in line1 to get desired info. for i in range(4): if n_successors[i] > 0: for j in range(n_successors[i]): successors[i].append( (int(line1[6 + 2 * k + j]), int(line1[6 + 2 * k + n_successors[i] + j])) ) # e.g. successor[i=2(FS)] = [(FS successor id, min. time-lag),...] k += n_successors[i] ### second block ### line2 = stripped_lines[n_activities + activity + 1] k = int(line2[2]) # principle resource index w_k = int(line2[3]) # principle resource work-content q_min = [] # min. per-period resource allocation for each resource q_max = [] # max. per-period resource allocation for each resource for r in range(n_resources): q_min.append(int(line2[4 + 2 * r])) q_max.append(int(line2[4 + 2 * r + 1])) ### create task ### task = Task(task_id, successors, k, w_k, q_min, q_max) tasks[task_id] = task ### last line ### last_line = stripped_lines[2 * n_activities + 1] R_max = [] # resource_availabilities for r in range(n_resources): R_max.append(int(last_line[r])) l_min = int(last_line[n_resources]) # min. block length ### create project ### name = os.path.splitext(os.path.basename( os.path.normpath(path_to_file)))[0] project = Project(name, tasks, R_max, l_min) return (project)
def get_elements(self): buff = self.buff.replace("\n", " ") # multi split elements: ".", ",", ":" import re for i in re.split('; |, |-|\.|\?|:', buff): yield i
m = typedef2_pat.match(line) if m: mode = IN_ENUM decls = {} idx = 0 elif mode == FOUND_ENUM: m = openbrace_pat.match(line) if m: mode = IN_ENUM decls = {} idx = 0 else: assert False, "Invalid z3_api.h, line: %s" % linenum else: assert mode == IN_ENUM words = re.split('[^\-a-zA-Z0-9_]+', line) m = closebrace_pat.match(line) if m: name = words[1] z3consts.write('# enum %s\n' % name) for k, i in decls.iteritems(): z3consts.write('%s = %s\n' % (k, i)) z3consts.write('\n') mode = SEARCHING else: if words[2] != '': if len(words[2]) > 1 and words[2][1] == 'x': idx = int(words[2], 16) else: idx = int(words[2]) decls[words[1]] = idx
from collections import Counter
from re import split


def word_count(phrase):
    counter = Counter()
    for word in split('\W+', phrase.lower()):
        if word == '':
            continue
        counter[word] += 1
    return counter
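# Usage sketch (made-up phrase):
#
#   >>> word_count("Go, go, GO!")
#   Counter({'go': 3})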
def gen(lang): global include, INCL_DIR print('Generating bindings for', lang) templ = template[lang] print('Generating bindings for', lang) for target in include: prefix = templ[target] outfile = open(templ['out_file'] % (prefix), 'wb') # open as binary prevents windows newlines outfile.write((templ['header'] % (prefix)).encode("utf-8")) lines = open(INCL_DIR + target).readlines() count = 0 for line in lines: line = line.strip() if line.startswith(MARKUP): # markup for comments outfile.write(("\n%s%s%s\n" %(templ['comment_open'], \ line.replace(MARKUP, ''), \ templ['comment_close']) ).encode("utf-8")) continue if line == '' or line.startswith('//'): continue if line.startswith('#define '): line = line[8:] #cut off define xline = re.split('\s+', line, 1) #split to at most 2 express if len(xline) != 2: continue if '(' in xline[0] or ')' in xline[ 0]: #does it look like a function continue xline.insert( 1, '=') # insert an = so the expression below can parse it line = ' '.join(xline) if not line.startswith(prefix.upper()): continue tmp = line.strip().split(',') for t in tmp: t = t.strip() if not t or t.startswith('//'): continue # hacky: remove type cast (uint64_t) t = t.replace('(uint64_t)', '') t = re.sub(r'\((\d+)ULL << (\d+)\)', r'\1 << \2', t) # (1ULL<<1) to 1 << 1 f = re.split('\s+', t) if f[0].startswith(prefix.upper()): if len(f) > 1 and f[1] not in ('//', '///<', '='): print("Error: Unable to convert %s" % f) continue elif len(f) > 1 and f[1] == '=': rhs = ''.join(f[2:]) else: rhs = str(count) count += 1 try: count = int(rhs) + 1 if (count == 1): outfile.write(("\n").encode("utf-8")) except ValueError: if lang == 'ocaml': # ocaml uses lsl for '<<', lor for '|' rhs = rhs.replace('<<', ' lsl ') rhs = rhs.replace('|', ' lor ') # ocaml variable has _ as prefix if rhs[0].isalpha(): rhs = '_' + rhs outfile.write((templ['line_format'] % (f[0].strip(), rhs)).encode("utf-8")) outfile.write((templ['footer']).encode("utf-8")) outfile.close()
# from sys import stdin # inFile = stdin.readlines import re inFile = open('input.txt', 'r', encoding='utf8') lines = str(inFile.readlines()) # words = lines.replace(';', ' ').split() # print(words) newLines = re.split(r', |_|-|!', lines) # print(newLines) myList = set() # count = 0 for elem in newLines: myList.add(elem) print(myList) punctuation = ['.', ',', ':', ';', '!', '?', '(', ')'] wordList = lines.split() inFile.close() # re.split('(\W+)', 'Words, words, words.')
def change_dir(command):
    current_path = os.getcwd()  # get the current directory
    # maxsplit=1 keeps a desired_dir that itself contains spaces in one piece
    command, desired_dir = re.split(" ", command, maxsplit=1)  # get the command and the desired dir
    os.chdir(desired_dir)
def natural_keys(text): return [atoi(c) for c in re.split('(\d+)', text)]
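# `atoi` is not shown in this snippet; a common companion helper (an assumption
# here, not taken from the source) converts digit chunks to int so the sort key
# mixes ints and strings:
#
#   def atoi(text):
#       return int(text) if text.isdigit() else text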
# Read policy from file policy = []; line = f.readline(); while line: line = line[0:-1]; policy.append(int(line)); line = f.readline(); # Get standardized ghost locations f = open("ghostLocs.txt","r"); line = f.readline(); all_ghost_x = []; all_ghost_y = []; while line: line = line[0:-1]; ghostx0, ghostx1, ghosty0, ghosty1 = re.split(' ',line); ghostLocX = [int(ghostx0),int(ghostx1)]; ghostLocY = [int(ghosty0), int(ghosty1)]; all_ghost_x.append(ghostLocX); all_ghost_y.append(ghostLocY); line = f.readline(); ############################################################################### ############################ Game Simulation ################################# ############################################################################### pacman_x,pacman_y = 1,3; num_ghosts = 2; #ghost_x = [2,3]; #ghost_y = [2,3]; win_count = 0
def alphanum_key(s): """ Turn a string into a list of string and number chunks. "z23a" -> ["z", 23, "a"] """ return [tryint(c) for c in re.split('([0-9]+)', s)]
import re
from typing import List


def tokenize(text: str) -> List[str]:
    return re.split(" ", text)
    elif u'’/s' not in s:
        return s.replace(u'‘/s ', '')
    else:
        return s

texts = u''.join(map(clean, sentences))  # join all the words back together
print 'Length of texts is %d' % len(texts)
print 'Example of texts: \n', texts[:300]
# file_object = open('train_clean.txt', 'w')
# file_object.write(str(texts.decode('utf-8')))
# file_object.close()

# re-split on punctuation marks
sentences = re.split(u'[,。!?、‘’“”]/[bems]', texts)
print 'Sentences number:', len(sentences)
print 'Sentence Example:\n', sentences[0]


def get_Xy(sentence):
    """Turn a sentence into [word1, w2, ..wn], [tag1, t2, ...tn]."""
    words_tags = re.findall('(.)/(.)', sentence)
    if words_tags:
        words_tags = np.asarray(words_tags)
        words = words_tags[:, 0]
        tags = words_tags[:, 1]
        return words, tags  # the characters and tags are stored separately as data / label
    return None
def _extractCoreInfo(unique_rep): ''' Return the core information of the given unique representation. ''' return re.split( ',', unique_rep[unique_rep.find('[') + 1:unique_rep.find(']')])
def greedy_decode(self, sentence, trg_gender=None, max_len=512): # vectorizing the src sentence on the char level and word level sentence = re.split(r'(\s+)', sentence) vectorized_src_sentence_char = [self.src_vocab_char.sos_idx] vectorized_src_sentence_word = [self.src_vocab_word.sos_idx] for word in sentence: for c in word: vectorized_src_sentence_char.append( self.src_vocab_char.lookup_token(c)) vectorized_src_sentence_word.append( self.src_vocab_word.lookup_token(word)) vectorized_src_sentence_word.append(self.src_vocab_word.eos_idx) vectorized_src_sentence_char.append(self.src_vocab_char.eos_idx) # getting sentence length src_sentence_length = [len(vectorized_src_sentence_char)] # vectorizing the trg gender if trg_gender: vectorized_trg_gender = self.trg_gender_vocab.lookup_token( trg_gender) vectorized_trg_gender = torch.tensor([vectorized_trg_gender], dtype=torch.long) else: vectorized_trg_gender = None # converting the lists to tensors vectorized_src_sentence_char = torch.tensor( [vectorized_src_sentence_char], dtype=torch.long) vectorized_src_sentence_word = torch.tensor( [vectorized_src_sentence_word], dtype=torch.long) src_sentence_length = torch.tensor(src_sentence_length, dtype=torch.long) # passing the src sentence to the encoder with torch.no_grad(): encoder_outputs, encoder_h_t = self.model.encoder( vectorized_src_sentence_char, vectorized_src_sentence_word, src_sentence_length) # creating attention mask attention_mask = self.model.create_mask(vectorized_src_sentence_char, self.src_vocab_char.pad_idx) # initializing the first decoder_h_t to encoder_h_t decoder_h_t = encoder_h_t context_vectors = torch.zeros(1, self.model.encoder.rnn.hidden_size * 2) # intializing the trg sequences to the <s> token trg_seqs = [self.trg_vocab_char.sos_idx] with torch.no_grad(): for i in range(max_len): y_t = torch.tensor([trg_seqs[-1]], dtype=torch.long) # do a single decoder step prediction, decoder_h_t, atten_scores, context_vectors = self.model.decoder( trg_seqs=y_t, encoder_outputs=encoder_outputs, decoder_h_t=decoder_h_t, context_vectors=context_vectors, attention_mask=attention_mask, trg_gender=vectorized_trg_gender) # getting the most probable prediciton max_pred = torch.argmax(prediction, dim=1).item() # if we reach </s> token, stop decoding if max_pred == self.trg_vocab_char.eos_idx: break trg_seqs.append(max_pred) str_sentence = self.get_str_sentence(trg_seqs, self.trg_vocab_char) return str_sentence
if not output_dirs: # Make transaction dirs if they don't exist # * I have my statements saved in sub dirs by year so this creates those output_dirs = sorted([f"{TRANSACTIONS_FOLDER}/{d}" for d in dirs]) for transaction_dir in output_dirs: if not os.path.isdir(transaction_dir): os.makedirs(transaction_dir) if files: for filename in files: path = f"{root}/{filename}" if os.path.splitext(path)[1] == ".pdf": contents = unpack.from_file(path).get("content", "") iterator = iter(re.split(f"({'|'.join(keywords)})", contents)) file_data = [] for key in iterator: if key in keywords: try: value = next(iterator) if key == TRANSACTIONS_HEADER: # Split by the date format: "Jan 1, 1970" # or 2 new lines split = re.split( r"(\w{3} \d{1,2}, 20\d{2})|\n\n", value,
def clean_optical_configs(self): ocs = self.cleaned_data["optical_configs"] cleaned = [] namestore = [] if self.instance: namestore.extend( [oc.name for oc in self.instance.optical_configs.all()]) # on update form allow for the same name (case insensitive) brackets = re.compile(r"[\[\]\{\}]") def _getpromise(fname): fp = FilterPromise(fname) if fp.is_valid: return fp else: self.add_error( "optical_configs", "Filter not found in database or at Chroma/Semrock: " "{}".format(fname), ) return None def lookup(fname, n=None): # lookup filter name in database, then check on chroma/semrock if not fname: return None if isinstance(fname, str) and fname.isdigit(): if n in (2, 3): self.add_error( "optical_configs", 'Laser line (integers) are only accepted in the second position (err: "%s" in position %d)' % (fname, n + 1), ) elif int(fname) < 300 or int(fname) > 1600: self.add_error( "optical_configs", "Laser wavelengths must be between 300-1600. Got: %s" % fname, ) else: return int(fname) try: return Filter.objects.get(name__icontains=fname) except MultipleObjectsReturned: try: return Filter.objects.get(part__iexact=fname) except ObjectDoesNotExist: return _getpromise(fname) except ObjectDoesNotExist: return _getpromise(fname) return None for linenum, line in enumerate(ocs.splitlines()): try: if not line: continue try: if (line.index("{") < line.index(",")) or ( line.index("}") < line.index(",")): self.add_error( "optical_configs", "No curly braces allowed in name (line #{})". format(linenum + 1), ) continue except Exception: pass _out = [] if brackets.search(line): _splt = [ i.strip() for i in re.split(r"({[^}]*})", line) if i.strip() ] splt = [] for item in _splt: if brackets.search(item): splt.append([ n.strip() for n in brackets.sub("", item).split(",") if n.strip() ]) else: if item.endswith(","): item = item[:-1] if item.startswith(","): item = item[1:] splt.extend([n.strip() for n in item.split(",")]) else: splt = [i.strip() for i in line.split(",")] if not len(splt) in (4, 5): self.add_error( "optical_configs", "Lines must have 4 or 5 comma-separated fields but this one " "has {}: {}".format(len(splt), line), ) for n, f in enumerate(splt): if n == 0: if f in namestore: self.add_error( "optical_configs", "Optical config with the name %s already exists." % f, ) else: namestore.append(f) _out.append(f) elif n == 4: try: if f.lower() in ("0", "false", "none"): _out.append(False) else: _out.append(True) except Exception: self.add_error( "optical_configs", "Unable to parse Boolean in position 5: %s" % f, ) else: if isinstance(f, list): _out.append([lookup(x, n) for x in f]) else: _out.append(lookup(f, n)) cleaned.append(_out) except Exception: self.add_error( "optical_configs", "Uknown error parsing line #{}: {}".format( linenum + 1, line), ) return cleaned
def search(path_to_index): stemmer = nltk.stem.SnowballStemmer('english') stop_words = {} reg = re.compile("\"|,| ") stop_file = open("stop_words.txt", "r") content = stop_file.read() content = re.split(reg, content) for word in content : if word : stop_words[word] = True title_tags = open(path_to_index+"/title_tags.txt", "r") title_position = pickle.load(open(path_to_index+"/title_positions.pickle", "rb")) word_position = pickle.load(open(path_to_index+"/word_positions.pickle", "rb")) field_map = {"t" : 0, "b" : 1, "i" : 2, "c" : 3} field_chars = ["t", "b", "i", "c"] files = [] for f in field_chars : file = path_to_index+ "/" + f + ".txt" fp = open(file, "r") files.append(fp) # final_result = [] while(1) : query = input() # print(query) start = time.time() result = [] documents = dict() query_words = list() # query = query.lower().strip() # start = time.time() # if (query == "exit") : # break if ":" in query : query_bag = query.split(" ") t_result=list() flag2=0 for q in query_bag : field_query = q.split(":") field = field_query[0] query = field_query[1] field = mapping_shortform(field) query_words = query.split() for word in query_words : word = stemmer.stem(word) if word in word_position and field in word_position[word] : position = word_position[word][field] files[field_map[field]].seek(position) intersection=list() s = files[field_map[field]].readline()[:-1] # remove "/n" [:-1] & read full line of posting list if "," in s : items = s.split(",") for item in items : document_score = item.split(":") doc_id = document_score[0] score = document_score[1] tt = 1 if doc_id in documents : documents[doc_id] = documents[doc_id] + float(score) else : documents[doc_id] = float(score) else : document_score = item.split(":") doc_id = document_score[0] score = document_score[1] tt = 1 union_list = list() if doc_id in documents : documents[doc_id] = documents[doc_id] + float(score) else : documents[doc_id] = float(score) else : query_bag = query.split() length = len(query_bag) for i in range(length) : query_bag[i] = stemmer.stem(query_bag[i]) for word in query_bag : if word not in stop_words and word in word_position: query_words.append(word) for word in query_words : docs = list() flag2=0 positions = word_position[word] for field in positions.keys() : position = positions[field] intersection=list() files[field_map[field]].seek(position) s = files[field_map[field]].readline()[: -1] if "," in s : items = s.split(",") for item in items : document_score = item.split(":") doc_id = document_score[0] score = document_score[1] tt = 1 if doc_id in documents : documents[doc_id] = documents[doc_id] + float(score) else : documents[doc_id] = float(score) else : document_score = item.split(":") doc_id = document_score[0] score = document_score[1] tt = 1 union_list = list() if doc_id in documents : documents[doc_id] = documents[doc_id] + float(score) else: documents[doc_id] = float(score) documents = sorted(documents.items(), key = operator.itemgetter(1), reverse = True) count = 1 end = time.time() print("Response Time : " + str(end - start) + " s\n") for document in documents : position = title_position[int(document[0]) - 1] title_tags.seek(position) title = title_tags.readline()[: -1] result.append(title) print(title) count += 1 if count > 10 : break print("\n")
def natural_sort(l): convert = lambda text: int(text) if text.isdigit() else text.lower() alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] return sorted(l, key = alphanum_key)
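# Usage sketch for natural_sort above (sample names are hypothetical): the
# captured '([0-9]+)' group keeps the digit runs in the split result so they can
# be compared numerically rather than lexicographically.
#
#   >>> natural_sort(['file10.txt', 'file2.txt', 'file1.txt'])
#   ['file1.txt', 'file2.txt', 'file10.txt']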