def acroize_heading(m):
    acro = text.get('acronym')
    if not acro:
        return m[0]
    heading = m[2]
    if not heading:
        return acro
    m2 = regex.match(r'(\d+(?:–(\d+))?)(?:\.)?\s*(.*)$', heading)
    if not m2:
        h_text = heading
    else:
        h_num = m2[1]
        h_text = m2[3]
        m3 = regex.match(r'(.*?)(\d+(?:–(\d+))?)$', text['acronym'])
        acro_prefix = m3[1]
        acro_num = m3[2]
        if acro_num == h_num:
            heading = h_text
        elif '–' in acro_num and h_num:
            acro = acro_prefix + h_num
            heading = h_text
    new_heading = f'<span class="acro">{acro}</span>{": " if h_text else ""}{h_text}'
    return f'{m[1]}{new_heading}'
def test_yaml(md_filepath):
    filestring = md_filepath.read()
    reg = regex.compile(r'^---(.*?)---', flags=regex.DOTALL)
    match = regex.search(reg, filestring)
    if not match:
        pytest.skip('No YAML header')
    yaml_text = match.group(1)
    parsed_yaml = yaml.load(yaml_text, Loader=yaml.SafeLoader)  # explicit Loader; plain yaml.load() is unsafe and removed in PyYAML >= 6
    for requirement in requirements:
        req = requirements[requirement]
        if req['required']:
            assert requirement in parsed_yaml, 'YAML metadata missing required element: ' + requirement
        if req['type'] == 'link':
            # Check external links have balanced brackets
            regexp = regex.compile(r'\[(.*)\]\((.*)\)')
            assert regex.match(regexp, parsed_yaml[requirement]), 'YAML metadata formatting error: ' + requirement
        if req['type'] == 'date' and requirement in parsed_yaml:
            try:
                d = parse(str(parsed_yaml[requirement]))
            except ValueError:
                assert False, 'YAML metadata formatting error: ' + requirement + ' date parse failed.'
            regexp = regex.compile(r'20[0-9]{2}-[0-9]{2}-[0-9]{2}')
            assert regex.match(regexp, str(parsed_yaml[requirement])), \
                'YAML metadata formatting error: ' + requirement + ' should use the format YYYY-MM-DD.'
    for header in parsed_yaml:
        assert header in requirements, 'YAML metadata header ' + header + ' is not a valid metadata type.'
def get_next_document(h):
    while True:
        l = h.readline()
        if not l:
            doc = None
            break
        l = l.decode('utf-8').strip()
        if not l:
            continue
        if re.match(u'^<doc ', l, re.UNICODE):
            # Fix _unk_.
            l = re.sub(r'_unk_', 'unknown', l)
            # Forum detection.
            # NB: flags must be passed by keyword; the 4th positional argument of
            # re.sub is `count`, so a bare re.UNICODE there would be silently wrong.
            if re.match(RE_FORUM, l, re.UNICODE):
                l = re.sub(u'>$', r' forum="1">', l, flags=re.UNICODE)
            else:
                l = re.sub(u'>$', r' forum="0">', l, flags=re.UNICODE)
            # Host and tld extraction.
            l = re.sub(r'( url="https{0,1}://)([^/]+)\.([a-z]{2,4})(|/|%)([^"]*")',
                       r'\1\2.\3\4\5 urldomain="\2.\3" tld="\3"', l)
            # Fix some known problems in doc attr values.
            l = re.sub(r'=" +"', r'="unknown"', l)      # fix: attr=" "
            l = re.sub(r'="([^"]+)\\" ', r'="\1" ', l)  # fix: attr="val\"
            doc = [l]
        else:
            doc = doc + [l]
            if re.match(u'^</doc>', l, re.UNICODE):
                break
    return doc
def process_lines(lines, NONBREAKING_PREFIX):
    # loop text, add lines together until we get a blank line or a <p>
    out_text = ''
    text = ""
    for line in lines:
        line = line.strip()
        m = re_tag.match(line)
        if m is None:
            m = regex.match(r'^\s*$', line)
        if m is not None:
            # time to process this block, we've hit a blank or <p>
            out_text += do_it_for(text, line, NONBREAKING_PREFIX)
            if regex.match(r'^\s*$', line) and len(text):
                # if we have text followed by <P>
                out_text += "<P>\n"
            text = ""
        else:
            # append the text, with a space
            text += line + " "
    # do the leftover text
    if len(text):
        out_text += do_it_for(text, "", NONBREAKING_PREFIX)
    return out_text
def process_file(file_path, tagger, idf_doc_count, idf_table, threshold, maximum_words):
    """
    Takes the uploaded file, detects its type (plain text, ALTO XML, zip) and
    calls a parsing function accordingly. If everything succeeds, it returns
    keywords and a 200 code; otherwise it returns an error.
    """
    file_info = magic.from_file(file_path)
    lines = []
    # NB: the parentheses in the libmagic description are literal and must be escaped,
    # otherwise the pattern never matches "UTF-8 Unicode (with BOM) text".
    if re.match(r"^UTF-8 Unicode \(with BOM\) text", file_info):
        lines = lines_from_txt_file(file_path, encoding='utf-8-sig')
    elif re.match("^UTF-8 Unicode", file_info):
        lines = lines_from_txt_file(file_path, encoding='utf-8')
    elif re.match("^ASCII text", file_info):
        lines = lines_from_txt_file(file_path, encoding='utf-8')
    elif re.match('^XML 1.0 document', file_info) and \
            (file_path.endswith('.alto') or file_path.endswith('.xml')):
        lines = lines_from_alto_file(file_path)
    elif re.match('^Zip archive data', file_info):
        lines = lines_from_zip_file(file_path)
    else:
        return {"error": "Unsupported file type: {}".format(file_info)}, 400

    if not lines:
        return {"error": "Empty file"}, 400

    return keywords.get_keywords(lines, tagger, idf_doc_count, idf_table,
                                 threshold, maximum_words), 200
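# Usage sketch for process_file (hypothetical arguments; `tagger` and the IDF
# table come from the caller's keyword-extraction setup, not from this snippet):
#
#   result, status = process_file('/tmp/upload.txt', tagger, idf_doc_count,
#                                 idf_table, threshold=0.2, maximum_words=15)
#
# On success, status is 200 and result holds the keywords; on failure, status
# is 400 and result is {"error": ...}.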
def guess_split(majiribun, reading):
    # NB: \p{Hiragana} requires the third-party `regex` module (presumably
    # imported as `re` in this module); the stdlib re has no \p{...} classes.
    kanjis = []
    matchreg_greedy = ''
    matchreg_nongreedy = ''
    for char in majiribun:
        if kanji_re.match(char):
            kanjis.append(char)
            matchreg_greedy += r"(\p{Hiragana}+)"
            matchreg_nongreedy += r"(\p{Hiragana}+?)"
        else:
            matchreg_greedy += re.escape(char)
            matchreg_nongreedy += re.escape(char)
    m = re.match(matchreg_greedy + '$', reading)
    if m:
        yomis = m.groups()
        yomis_nongreedy = re.match(matchreg_nongreedy + '$', reading).groups()
        if yomis != yomis_nongreedy:
            # Ambiguous!
            return None
        d = {}
        for idx in range(0, len(kanjis)):
            d[kanjis[idx]] = yomis[idx]
        return d
def testRegex(self):
    # Basic match, beginning of string
    self.assertEqual(1, match("foo", "foobar"))
    # Basic match, middle of string
    self.assertEqual(1, match("oba", "foobar"))
    # Basic match, no match
    self.assertEqual(0, match("obo", "foobar"))
    # Match with start qualifier
    self.assertEqual(1, match("^fo", "foobar"))
    # Match with start qualifier in body
    self.assertEqual(0, match("^bar", "foobar"))
    # Match with end qualifier
    self.assertEqual(1, match("bar$", "foobar"))
    # Match with end qualifier in body
    self.assertEqual(0, match("foo$", "foobar"))
    # Match with optional qualifier
    self.assertEqual(1, match("fo*b", "foobar"))
    # Match with optional qualifier 2
    self.assertEqual(1, match("fooa*b", "foobar"))
    # Match with optional qualifier 3
    self.assertEqual(1, match("a*foo", "foobar"))
def __init__(self, room, s, negative=True):
    """
    Parses expressions like '/5m jid blabla@server', 'nick exp regexp',
    etc. (in short, gluks-style).
    """
    self.room = room
    self.negative = negative
    self.end_time, s = fetch_time(s)
    if s.count('||'):
        s, self.reason = s[:s.find('||')].strip(), s[s.find('||') + 2:].strip()
    else:
        s, self.reason = s.strip(), ''
    if s.lower().startswith('jid '):
        self.by_jid = True
        s = s[4:].lower()
        if not s:
            raise ValueError
    elif s.lower().startswith('nick '):
        self.by_jid = False
        s = s[5:]
        if not s:
            raise ValueError
    else:
        self.by_jid = True
        self.regexp = False
        item = room.get(s, None)
        if item:
            if item.jid == item.realjid:
                raise NoJID(item.jid)
            else:
                self.value = item.realjid.lower()
        else:
            raise NickNotFound(s)
        return
    if s.lower().startswith('exp '):
        self.regexp = True
        s = s[4:]
        try:
            regex.match(s, '*****@*****.**')
        except:
            raise MyRegexpError(s)
    else:
        self.regexp = False
    self.value = s
def parseaddr(address):
    # This is probably not perfect
    # NB: this uses the legacy Python 1.x `regex` and `string` modules:
    # regex.search returns a position (or -1) and regex.match returns the
    # length of the match, hence the integer arithmetic below.
    address = string.strip(address)
    # Case 1: part of the address is in <xx@xx> form.
    pos = regex.search('<.*>', address)
    if pos >= 0:
        name = address[:pos]
        address = address[pos:]
        length = regex.match('<.*>', address)
        name = name + address[length:]
        address = address[:length]
    else:
        # Case 2: part of the address is in (comment) form
        pos = regex.search('(.*)', address)
        if pos >= 0:
            name = address[pos:]
            address = address[:pos]
            length = regex.match('(.*)', name)
            address = address + name[length:]
            name = name[:length]
        else:
            # Case 3: neither.  Only an address
            name = ''
    name = string.strip(name)
    address = string.strip(address)
    if address and address[0] == '<' and address[-1] == '>':
        address = address[1:-1]
    if name and name[0] == '(' and name[-1] == ')':
        name = name[1:-1]
    return name, address
def faiordict2contigorder(file_name, file_format):
    '''Takes either a .fai or .dict file, and returns a contig order dictionary, i.e., chrom_seq['chr1'] == 0'''
    assert file_format in ('fai', 'dict')
    contig_sequence = []
    with open(file_name) as gfile:
        line_i = gfile.readline().rstrip('\n')
        while line_i:
            # Reset each iteration: otherwise a non-@SQ line in a .dict file would
            # either reuse a stale match or raise UnboundLocalError on the first line.
            contig_match = None
            if file_format == 'fai':
                contig_match = re.match(r'([^\t]+)\t', line_i)
            elif file_format == 'dict':
                if line_i.startswith('@SQ'):
                    contig_match = re.match(r'@SQ\tSN:([^\t]+)\tLN:', line_i)
            if contig_match:
                contig_i = contig_match.groups()[0].split(' ')[0]  # some .fai files have space after the contig for descriptions.
                contig_sequence.append(contig_i)
            line_i = gfile.readline().rstrip('\n')
    chrom_seq = {}
    for n, contig_i in enumerate(contig_sequence):
        chrom_seq[contig_i] = n
    return chrom_seq
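# A minimal runnable check for faiordict2contigorder, assuming `re` is imported
# in this module; the .fai content below is hypothetical:
import tempfile

with tempfile.NamedTemporaryFile('w', suffix='.fai', delete=False) as _tmp:
    _tmp.write('chr1\t248956422\t112\t70\t71\nchr2\t242193529\t253105714\t70\t71\n')

assert faiordict2contigorder(_tmp.name, 'fai') == {'chr1': 0, 'chr2': 1}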
def __init__(self, text, lv=None, lc=None, vc=None):
    if isinstance(text, Ex):
        self._text = text.text
        self._lc = text.lc
        self._vc = text.vc
    elif not isinstance(text, str):
        raise TypeError("text must be string")
    else:
        self._text = text
        if lv:
            if re.match(r'^[a-z]{3}-[0-9]{3,}$', lv):
                self._lc = lv[:3]
                self._vc = int(lv[4:])
            else:
                raise ValueError("lv must be in the format xxx-000")
        elif lc and (vc is not None):
            lc = lc.lower()
            if re.match(r'^[a-z]{3}$', lc):
                self._lc = lc
            else:
                raise ValueError("lc must be a 3-letter ISO 639 code")
            try:
                vc = int(vc)
                if vc < 0:
                    raise ValueError("vc must be a positive integer")
                self._vc = vc
            except ValueError:
                raise ValueError("vc must be a positive integer")
        else:
            raise TypeError("{cls} requires lv".format(cls=self.__class__.__name__))
def create_activation(data, labels, standard_cols, group_labels=[]):
    activation = database.Activation()

    for i, col in enumerate(data):
        # Cast to integer or float if appropriate
        # if regex.match('[-\d]+$', col):
        #     col = int(col)
        # elif regex.match('[-\d\.]+$', col):
        #     col = float(col)

        # Set standard attributes if applicable and do validation where appropriate.
        # Generally, validation will not prevent a bad value from making it into the
        # activation object, but it will flag any potential issues using the "problem" column.
        if standard_cols[i] is not None:
            sc = standard_cols[i]

            # Validate XYZ columns: Should only be integers (and possible trailing decimals).
            # If they're not, keep only leading numbers. The exception is that ScienceDirect
            # journals often follow the minus sign with a space (e.g., - 35), which we strip.
            if regex.match('[xyz]$', sc):
                m = regex.match('(-)\s+(\d+\.*\d*)$', col)
                if m:
                    col = "%s%s" % (m.group(1), m.group(2))
                if not regex.match('(-*\d+)\.*\d*$', col):
                    logging.debug("Value %s in %s column is not valid" % (col, sc))
                    activation.problems.append("Value in %s column is not valid" % sc)
                    # col = regex.search('(-*\d+)', col).group(1)
                    return activation
                col = float(col)
            elif sc == 'region':
                if not regex.search('[a-zA-Z]', col):
                    logging.debug("Value in region column is not a string")
                    activation.problems.append("Value in region column is not a string")
            setattr(activation, sc, col)

        # Always include all columns in record
        activation.add_col(labels[i], col)

        # Handle columns with multiple coordinates (e.g., 45;12;-12).
        # Assume that any series of 3 numbers in a non-standard column
        # reflects coordinates. Will fail if there are leading numbers!!!
        # Also need to remove space between minus sign and numbers; some ScienceDirect
        # journals leave a gap.
        if i not in standard_cols:
            cs = '([\-\.\s]*\d{1,3})'
            m = regex.search('%s[,;\s]+%s[,;\s]+%s' % (cs, cs, cs), unicode(col).strip())
            if m:
                x, y, z = [regex.sub('-\s+', '-', c) for c in [m.group(1), m.group(2), m.group(3)]]
                logger.info("Found multi-coordinate column: %s\n...and extracted: %s, %s, %s" % (col, x, y, z))
                activation.set_coords(x, y, z)

    activation.groups = group_labels
    return activation
def parse_line(line, perv_url):
    if not line or len(line.strip()) == 0:
        raise ValueError("STR_EMPTY")
    line = line.strip()
    spt = line.split('-')
    if len(spt) == 3:
        name_1 = spt[0]
        name_2 = spt[1]
        attrs = spt[2]
        attrs_spt = attrs.split(',')
        if not (len(attrs_spt) == 2 or (len(attrs_spt) == 1 and perv_url)):
            raise ValueError("STR_ENTRY_EMPTY")
        if not name_1 \
                or not name_2 \
                or not regex.match("^[" + _cryllic + "\s]+$", name_1) \
                or not regex.match("^[" + _cryllic + "\s]+$", name_2) \
                or len(name_1.split(' ')) != 2 \
                or len(name_2.split(' ')) != 2:
            raise ValueError("STR_NAME_FORMAT")
        if name_1 == name_2:
            raise ValueError("STR_SAME_NAMES")
        if len(attrs_spt) == 2 and perv_url:
            raise ValueError("STR_TAG_FORMAT")
        if not regex.match("^(?!\.)[" + _cryllic + "\.]+(?<!\.)$", attrs_spt[0]):
            raise ValueError("STR_TAG_FORMAT")
        link_types = attrs_spt[0].split('.')
        if filter(lambda x: not x, link_types):
            raise ValueError("STR_TAG_FORMAT")
        arr = collections.Counter(link_types)
        doubled_tags = set(i for i in arr if arr[i] > 1)
        if len(doubled_tags) != 0:
            raise ValueError("STR_TAG_DOUBLED:" + ",".join(doubled_tags))
        url = attrs_spt[1] if len(attrs_spt) == 2 else perv_url
        if not regex.match("http://[\w\.]+/[\w]+$", url):
            raise ValueError("STR_LINK_FORMAT")
        """
        sim_names = list(es.get_similar_names([name_1, name_2]))
        if isinstance(sim_names[0], basestring):
            raise ValueError(u"STR_SIMILAR_NAME:{},{}".format(name_1, sim_names[0]))
        if isinstance(sim_names[1], basestring):
            raise ValueError(u"STR_SIMILAR_NAME:{},{}".format(name_2, sim_names[1]))
        tags = filter(lambda x: not x[1], zip(link_types, es.check_tags(link_types)))
        if len(tags) != 0:
            raise ValueError(u"STR_TAG_NOT_FOUND:{}".format(",".join(map(lambda x: x[0], tags))))
        """
        return (name_1, name_2, link_types, url)
    else:
        raise ValueError("STR_FORMAT")
def test_zero_or_one(self):
    p = regex.build_regex("ba?")
    result = regex.match(p, "ba")
    self.assertTrue(result)
    result = regex.match(p, "b")
    self.assertTrue(result)
    result = regex.match(p, "aa")
    self.assertFalse(result)
def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
    """Run regexes against message's marked lines to strip quotations.

    Return all but the last quoted segment if it exists.
    >>> mark_message_lines(['Hello', 'From: [email protected]', '', '> Hi', 'tsem'])
    ['Hello']

    Also returns return_flags.
    return_flags = [were_lines_deleted, first_deleted_line, last_deleted_line]
    """
    # Pre-process marker sequence
    # if there are no splitters there should be no markers. However, allow markers if more than 3!
    if 's' not in markers and not re.search('(me*){3}', markers):
        markers = markers.replace('m', 't')

    # Look for forwards (don't remove anything on a forward)
    # if there is an f before the first split, then it's a forward.
    if re.match('[te]*f', markers):
        return_flags[:] = [False, -1, -1]
        return lines

    # Remove last quoted segment:
    # match from the end of the markers list
    markers.reverse()

    # match for unmarked quote following split
    quotation = re.match(r'e*(te*)+(se*)+', markers)

    if not quotation:
        # match for inline replies
        if re_orig.match(r'e*[mfts]*((te*)+(me*)+)+[mfts]*((se*)+|(me*){2,})', markers):
            return_flags[:] = [False, -1, -1]
            return lines
        # match for normal reply with quote
        quotation = re_orig.match(r'e*(me*)+[mefts]*((se*)+|(me*){2,})', markers)

    if not quotation:
        # match for normal reply with quote and signature below quote
        if re.match(r'e*(te*)+(me*)+.*(s)+e*(te*)+', markers):
            quotation = re.match(r'e*(te*)+(me*)+.*(s)+', markers)

    markers.reverse()

    # If quotation, return it
    if quotation:
        start = len(markers) - quotation.end() + 1
        end = len(markers) - quotation.start() - 1
        return_flags[:] = True, start, end
        return lines[:start] + lines[end:]

    return_flags[:] = [False, -1, -1]
    return lines
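# Usage sketch (hypothetical marker sequence; letters per the patterns above:
# 't' text, 'm' quoted, 's' splitter, 'e' empty, 'f' forward). For a reply whose
# lines are marked t, s, m, m, the call
#
#   process_marked_lines(lines, markers)
#
# matches the reversed markers against the quotation patterns, strips the
# trailing quoted segment, and returns only the reply text.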
def printDiseaseClass(outStream, line):
    outStream.write('<%s/%s> \n\ta %s;\n' % (Namespace, line[0], visumpointGene.VP_PharmGKBDisease))
    outStream.write('\trdfs:label "%s"^^xsd:string ;\n' % (visumpointGene.strip(line[1])))
    outStream.write('\tskos:prefLabel "%s"^^xsd:string ;\n' % (visumpointGene.strip(line[1])))
    trip = ""
    useName = ""
    if len(line[2]) > 0:
        for name in line[2].split(','):
            if name.startswith("\"") and name.endswith("\""):
                useName = name
            elif name.startswith("\""):
                trip = name
            elif name.endswith("\""):
                useName = (trip + "," + name).strip("\"")
                trip = ""
            else:
                useName = name
            if len(useName) > 0:
                visumpointGene.printAlternativeName(outStream, "", visumpointGene.strip(useName), visumpointGene.VP_PharmGKB, "\t\t")
                useName = ""
    db_id_type_strB = "(.+):(.+)\((.+)\[(.+)/(.+)\]"
    db_id_PP = r'(.+):(.+)\(([^()]++)\((.*)\)+'
    strExp = "(.+):(.+)\((.+)"
    if len(line[4]) > 0:
        for name in line[4].split('),'):
            # db_id_type_strB = "(.+):(.+)\((.+)\[(.+)/(.+)\]"
            try:
                match = regex.match(db_id_type_strB, name, regex.M | regex.I)
                if match:
                    visumpointGene.printCrossReference(outStream, dbReference(match.group(1)), visumpointGene.strip(match.group(2)), visumpointGene.VP_PharmGKB, "\t\t")
                else:
                    # db_id_PP = r'(.+):(.+)\(([^()]++)\((.*)\)+'
                    try:
                        match1 = regex.match(db_id_PP, name, re.M | re.I)
                        if match1:
                            visumpointGene.printCrossReference(outStream, dbReference(match1.group(1)), visumpointGene.strip(match1.group(2)), visumpointGene.VP_PharmGKB, "\t\t")
                        else:
                            # strExp = "(.+):(.+)\((.+)"
                            try:
                                match2 = regex.match(strExp, name, re.M | re.I)
                                if match2:
                                    visumpointGene.printCrossReference(outStream, dbReference(match2.group(1)), visumpointGene.strip(match2.group(2)), visumpointGene.VP_PharmGKB, "\t\t")
                                else:
                                    outStream.write("#ERROR : processing External References : %s\n" % name)
                                    outStream.write("#\t\t %s\n" % line[4])
                            except:
                                e = sys.exc_info()[0]
                                print("Error: %s" % e)
                    except:
                        e = sys.exc_info()[0]
                        print("Error: %s" % e)
            except:
                e = sys.exc_info()[0]
                print("Error: %s" % e)
    outStream.write('\t. # %s %s\n\n' % (line[1], line[0]))
def replacement(m):
    # Ignore 'also cf. ix p. 393ff' and ' A'
    first, second = m[1], m[2]
    if regex.match(r".+p\.\s*", first) or regex.match(r".+A", first):
        return "{}{}".format(first, second)
    else:
        # replace Unicode minus sign (U+2212) with ASCII dash
        path = second.replace("−", "-")
        return '{}<a href="{}{}" target="_blank">{}</a>'.format(first, baseurl, path, second)
def test_match_many(self):
    p = regex.build_regex("ab[cde]fg")
    result = regex.match(p, "abcfg")
    self.assertTrue(result)
    result = regex.match(p, "abdfg")
    self.assertTrue(result)
    result = regex.match(p, "abefg")
    self.assertTrue(result)
    result = regex.match(p, "abfg")
    self.assertFalse(result)
def hey(what):
    what = what.strip()
    plain = regex.sub(u'[\s{}]+'.format(regex.escape(punctuation)), '', what)
    if not regex.match(u'^[\d\s]+$', plain):
        if regex.match(u'^\s*$', what):
            return 'Fine. Be that way!'
        if regex.match(u'^[\d\s\p{Lu}]+$', plain):
            return 'Whoa, chill out!'
        if what.endswith('?'):
            return 'Sure.'
    return 'Whatever.'
def catchup(coordinate_i, line_j, filehandle_j, chrom_sequence):
    '''
    Keep reading the j_th vcf file until it hits (or goes past) the i_th
    coordinate, at which time the function stops reading and you can do stuff.

    Returns (True, Vcf_line_j) if the j_th vcf file contains an entry that
    matches the i_th coordinate.
    Returns (False, Vcf_line_j) if the j_th vcf file does not contain such an
    entry, and therefore the function has run past the i_th coordinate, by
    which time the programmer can decide to move on to the next i_th coordinate.
    '''
    coordinate_j = re.match(pattern_chr_position, line_j)
    if coordinate_j:
        coordinate_j = coordinate_j.group()
    else:
        coordinate_j = ''

    # Which coordinate is behind?
    is_behind = whoisbehind(coordinate_i, coordinate_j, chrom_sequence)

    # The file_j is already ahead, return the same line_j, but tag it "False"
    if is_behind == 0:
        reporter = (False, line_j)

    # The two coordinates are the same, return the same line_j, but tag it "True"
    elif is_behind == 10:
        reporter = (True, line_j)

    # If file_j is behind, then it needs to catch up:
    elif is_behind == 1:
        # Keep at it until line_j is no longer behind:
        while is_behind == 1:
            # Catch up
            line_j = filehandle_j.readline().rstrip()
            next_coord = re.match(pattern_chr_position, line_j)
            if next_coord:
                coordinate_j = next_coord.group()
            else:
                coordinate_j = ''
            is_behind = whoisbehind(coordinate_i, coordinate_j, chrom_sequence)

        # If file_j has caught up exactly to the position of coordinate_i:
        if is_behind == 10:
            reporter = (True, line_j)
        # If file_j has run past coordinate_i:
        elif is_behind == 0:
            reporter = (False, line_j)

    return reporter
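# Usage sketch for catchup (hypothetical handles; per the branches above,
# whoisbehind() returns 1 if file j is behind, 0 if it is ahead, and 10 if the
# two coordinates match):
#
#   hit, line_j = catchup(coordinate_i, line_j, vcf_j_handle, chrom_seq)
#   if hit:  line_j is the j-th file's entry at coordinate_i
#   else:    the j-th file has no such entry and has now read past coordinate_i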
def readDrugLabelsFile(infilename):
    drug = ""
    genes = ""
    x = 0
    table = {}
    try:
        with open(infilename, 'r') as f:
            lineRaw = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
            for line in lineRaw:
                if x > START:
                    print("Drug Labels : %s - %s" % (x, line[1]))
                    try:
                        match = regex.match(r'FDA Label for ([a-zA-Z0-9 \-,]+) and (.+)', line[1], regex.M | regex.I)
                        if not match:
                            match = regex.match(r'European Medicines Agency \(EMA\) Label for ([a-zA-Z0-9 \-,]+) and (.+)', line[1], regex.M | regex.I)
                        if match:
                            drug = match.group(1)
                            genes = match.group(2)
                        else:
                            match = regex.match(r'FDA Label for ([a-zA-Z0-9 \-,]+)', line[1], regex.M | regex.I)
                            if not match:
                                match = regex.match(r'European Medicines Agency \(EMA\) Label for ([a-zA-Z0-9 \-,]+)', line[1], regex.M | regex.I)
                            if match:
                                drug = match.group(1)
                                genes = ""
                            else:
                                # supply both format arguments, matching the error print below
                                print("ERROR ON ROW - readDrugLabelsFile : %s - %s" % (x, line[1]))
                        if len(drug) > 0:
                            if drug in table:
                                row = table[drug]
                            else:
                                row = {}
                                table[drug] = row
                            if len(genes) > 0:
                                for gene in genes.split(","):
                                    row[gene] = {gene, line[2], line[3], line[4]}
                            else:
                                row[NO_GENE] = {NO_GENE, line[2], line[3], line[4]}
                            drug = ""
                            genes = ""
                    except:
                        print("ERROR ON ROW - readDrugLabelsFile 2: %s - %s" % (x, line[0]))
                if x > END:
                    return table
                x = x + 1
    finally:
        f.close()
    return table
def untag(string, default_lv='und-000'):
    ap = Ap()
    string = string.replace('⁋⫷mn', '⫷mn')
    string = string.replace('⁋', '⫷mn⫸')
    string = string.replace('‣⫷ex', '⫷ex')
    string = string.replace('‣', '⫷ex⫸')
    mn = None
    dn = None
    superclass_ex = None
    attribute = None
    tag_split = [s for s in re.split(r'(⫷.*?⫸[^⫷]+)', string) if s]
    if not tag_split:
        return ap
    if tag_split[0] != '⫷mn⫸':
        tag_split.insert(0, '⫷mn⫸')
    if not tag_split[1].startswith('⫷'):
        tag_split[1] = '⫷ex⫸' + tag_split[1]
    for s in tag_split:
        obj, lv, text = re.search(r'⫷(.+?)(?::([a-z]{3}-\d{3}))?⫸(.*)', s).groups()
        if not lv:
            lv = default_lv
        if obj == 'mn':
            if mn:
                ap.append(mn)
            mn = Mn()
        if obj == 'ex':
            if dn:
                mn.dn_list.append(dn)
            dn = Dn(Ex(text, lv))
        if obj == 'df':
            mn.df_list.append(Df(text, lv))
        if re.match(r'[dm]cs[12]?', obj):
            if obj[-1] == '2':
                superclass_ex = Ex(text, lv)
            else:
                if obj.startswith('m'):
                    mn.cs_list.append(Cs(Ex(text, lv), superclass_ex))
                if obj.startswith('d'):
                    dn.cs_list.append(Cs(Ex(text, lv), superclass_ex))
                superclass_ex = None
        if re.match(r'[dm]pp', obj):
            if attribute is None:
                attribute = Ex(text, lv)
            else:
                if obj.startswith('m'):
                    mn.pp_list.append(Pp(text, attribute))
                if obj.startswith('d'):
                    dn.pp_list.append(Pp(text, attribute))
                attribute = None
    if dn:
        mn.dn_list.append(dn)
    if mn:
        ap.append(mn)
    return ap
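# Usage sketch (assumed PanLex-style tag format, inferred from the regex
# r'⫷(.+?)(?::([a-z]{3}-\d{3}))?⫸(.*)' above; Ap/Mn/Dn/Ex are this module's types):
#
#   untag('⫷mn⫸⫷ex:eng-000⫸dog⫷ex:spa-000⫸perro')
#
# parses one meaning carrying two expressions; '⁋' and '‣' are shorthand for the
# new-meaning and new-expression tags rewritten at the top of the function.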
def clean_keyword(self):
    keyword = self.cleaned_data.get("keyword", "").strip()
    if keyword == "" or (keyword and not regex.match(r"^[\d\*\#]+$", keyword, flags=regex.UNICODE)):
        raise forms.ValidationError(_("USSD code must contain only *,# and numbers"))
    return keyword
def readheaders(self):
    # NB: `regex` here is the legacy Python 1.x regex module, whose match()
    # returns the length of the match (or -1), hence the >= 0 comparison.
    self.headers = list = []
    self.status = ''
    headerseen = 0
    while 1:
        line = self.fp.readline()
        if not line:
            self.status = 'EOF in headers'
            break
        if self.islast(line):
            break
        elif headerseen and line[0] in ' \t':
            # It's a continuation line.
            list.append(line)
        elif regex.match('^[!-9;-~]+:', line) >= 0:
            # It's a header line.
            list.append(line)
            headerseen = 1
        else:
            # It's not a header line; stop here.
            if not headerseen:
                self.status = 'No headers'
            else:
                self.status = 'Bad header'
            # Try to undo the read.
            try:
                self.fp.seek(-len(line), 1)
            except IOError:
                self.status = self.status + '; bad seek'
            break
def split_example_file(example, dst_dir):
    lines = open(example).readlines()

    target_lines = []
    target_require_lines = []
    found_requires = False
    found_code = False
    for line in lines:
        m = re.match(r'goog.require\(\'(.*)\'\);', line)
        if m:
            found_requires = True
            target_require_lines.append(line)
        elif found_requires:
            if found_code or line not in ('\n', '\r\n'):
                found_code = True
                target_lines.append(line)

    target = open(os.path.join(dst_dir, os.path.basename(example)), 'w')
    target_require = open(
        os.path.join(dst_dir,
                     os.path.basename(example).replace('.js', '-require.js')),
        'w')
    target.writelines(target_lines)
    target.close()
    target_require.writelines(target_require_lines)
    target_require.close()
def process(self, srcfo):
    fields = set()
    current = None
    outfo = tempfile.TemporaryFile('w+')
    outfo.write('sc.lzh2enFallbackData = {')
    for lineno, line in enumerate(srcfo):
        if line.startswith('#') or line.isspace():
            continue
        m = regex.match(r'U\+(?<code>\w+)\s+(?<field>\w+)\s+(?<content>.*)', line)
        if not m:
            print('{}: {}'.format(lineno + 1, line))
            continue  # skip unparseable lines rather than crashing on m['field'] below
        fields.add(m['field'])
        if not current:
            current = {'code': m['code']}
        elif current['code'] != m['code']:
            self.writeout(current, outfo)
            current = {'code': m['code']}
        current[m['field']] = m['content']
    outfo.write('\n}\n')
    outfo.flush()
    outfo.seek(0)
    return outfo.read()
def test():
    s = 'aaa(((1+0)+1)+1)bbb'
    db_id_type_str = r'(.+):(.+)\((.+)'
    str1 = "NDFRT:N0000002071(Mycobacterium Infections [Disease/Finding])"
    db_id_type_strB = "(.+):(.+)\((.+)\[(.+)/(.+)\]"
    str2 = "SnoMedCT:109978004(T-cell lymphoma (clinical)"
    db_id_PP = r'(.+):(.+)\(([^()]++)\((.*)\)+'
    str3 = "MeSH:D015430(Weight Gain"
    strExp = "(.+):(.+)\((.+)"
    str4 = "MeSH:D015430(Weight Gain"

    match = regex.match(db_id_PP, str4, regex.M | regex.I)
    if match:
        print "matchObj.group() : ", match.group()
        print "matchObj.group(1) : ", match.group(1)
        print "matchObj.group(2) : ", match.group(2)
        print "matchObj.group(3) : ", match.group(3)
        print "matchObj.group(4) : ", match.group(4)
        print "matchObj.group(5) : ", match.group(5)
    else:
        print "No match!!"
def testSlurpy(testCase):
    m = re.match(reSlimp, testCase)
    if m:
        postFix = testCase[len(m.group()):]
        return re.fullmatch(reSlump, postFix) is not None
    else:
        return False
def read_file(f):
    """
    Reads a ply file and outputs a list of points.
    """
    import regex
    f.readline()  # skip the "ply" magic line
    binary = True if "binary" in f.readline() else False
    if binary:
        import os
        name = convert_to_ascii(f)
        ascii_f = open(name)
        points = read_file(ascii_f)
        ascii_f.close()
        os.system("rm " + name)
        return points
    else:
        points = []
        line = f.readline()
        while line:
            # '-' must come first inside the class; [\d\w-.] is an invalid range
            number = r"[-\d\w.]+"
            match = regex.match(r"^\s*(" + number + r")\s+(" + number + r")\s+(" + number + r")\s*$", line)
            try:
                points.append(Point(float(match.group(1)), float(match.group(2)), float(match.group(3))))
            except ValueError:
                pass
            except AttributeError:
                pass
            line = f.readline()
        return points
def dispatch_shorthand_command(msg):
    commands = GlobalVars.parser.unescape(msg.content[3:]).split()

    output = []
    processed_commands = []
    for cmd in commands:
        count, cmd = regex.match(r"^(\d*)(.*)", cmd).groups()
        for _ in range(int(count) if count else 1):
            processed_commands.append(cmd)

    should_return_output = False
    for current_command, message in zip(processed_commands,
                                        get_last_messages(msg.room, len(processed_commands))):
        if current_command == "-":
            output.append("[:{}] <skipped>".format(message.id))
        else:
            result = dispatch_reply_command(message, msg, current_command)
            if result:
                should_return_output = True
                output.append("[:{}] {}".format(message.id, result))
            else:
                output.append("[:{}] <processed without return value>".format(message.id))

    return "\n".join(output) if should_return_output else ""
def get_sheets_for_ref(tref, pad=True, context=1):
    """
    Returns a list of sheets that include ref,
    formatted as needed for the Client Sidebar.
    """
    oref = model.Ref(tref)
    if pad:
        oref = oref.padded_ref()
    if context:
        oref = oref.context_ref(context)
    ref_re = oref.regex()

    results = []

    regex_list = oref.regex(as_list=True)
    ref_clauses = [{"sources.ref": {"$regex": r}} for r in regex_list]
    sheets = db.sheets.find(
        {"$or": ref_clauses, "status": "public"},
        {"id": 1, "title": 1, "owner": 1, "sources.ref": 1, "views": 1}
    ).sort([["views", -1]])
    for sheet in sheets:
        matched_refs = []
        if "sources" in sheet:
            for source in sheet["sources"]:
                if "ref" in source:
                    matched_refs.append(source["ref"])
        matched_refs = [r for r in matched_refs if regex.match(ref_re, r)]
        for match in matched_refs:
            try:
                match = model.Ref(match)
            except InputError:
                continue
            ownerData = public_user_data(sheet["owner"])
            com = {
                "category": "Sheets",
                "type": "sheet",
                "owner": sheet["owner"],
                "_id": str(sheet["_id"]),
                "anchorRef": match.normal(),
                "anchorVerse": match.sections[-1] if len(match.sections) else 1,
                "public": True,
                "commentator": user_link(sheet["owner"]),  # legacy, used in S1
                "text": "<a class='sheetLink' href='/sheets/%d'>%s</a>" % (sheet["id"], strip_tags(sheet["title"])),  # legacy, used in S1
                "title": strip_tags(sheet["title"]),
                "sheetUrl": "/sheets/" + str(sheet["id"]),
                "ownerName": ownerData["name"],
                "ownerProfileUrl": ownerData["profileUrl"],
                "ownerImageUrl": ownerData["imageUrl"],
                "views": sheet["views"]
            }
            results.append(com)
    return results
isIndication = 0
isDosage = 0
nodestart = 0
nodeend = 0
indTl = None
dosTl = None
for child in ET.fromstring(section).iter():
    if child.tag == "code":
        if "code" in child.attrib:
            if child.attrib['code'] == "34068-7":
                isDosage = 1
            elif child.attrib['code'] == "34067-9":
                isIndication = 1
    if isIndication == 0 and child.tag == "title":
        if re.match(r".*\b(?:INDICATION|INDICATIONS)\b.*",
                    str(ET.tostring(child).decode("utf-8")),
                    re.IGNORECASE | re.DOTALL) \
                and len(indicText) == 0:
            isIndication = 1
    if isDosage == 0 and child.tag == "title":
        if re.match(r".*\bDOSAGE\b.*\bADMINISTRATION\b.*",
                    str(ET.tostring(child).decode("utf-8")),
                    re.IGNORECASE | re.DOTALL) \
                and len(dosText) == 0:
            isDosage = 1
    if isIndication == 1 and child.tag != "code":
        if ET.fromstring(section).find("text") is not None:
            for text in ET.fromstring(section).findall("./text"):
                indicTitles.append("")
                indicText.append(str(BeautifulSoup(
def __init__(self, mainWin, modulesWithNewerFileDates):
    super(DialogPluginManager, self).__init__(mainWin.parent)
    self.ENABLE = _("Enable")
    self.DISABLE = _("Disable")
    self.parent = mainWin.parent
    self.cntlr = mainWin

    # copy plugins for temporary display
    self.pluginConfig = PluginManager.pluginConfig
    self.pluginConfigChanged = False
    self.uiClassMethodsChanged = False
    self.modelClassesChanged = False
    self.customTransformsChanged = False
    self.disclosureSystemTypesChanged = False
    self.hostSystemFeaturesChanged = False
    self.modulesWithNewerFileDates = modulesWithNewerFileDates

    parentGeometry = re.match(r"(\d+)x(\d+)[+]?([-]?\d+)[+]?([-]?\d+)", self.parent.geometry())
    dialogX = int(parentGeometry.group(3))
    dialogY = int(parentGeometry.group(4))
    self.title(_("Plug-in Manager"))
    frame = Frame(self)

    # left button frame
    buttonFrame = Frame(frame, width=40)
    buttonFrame.columnconfigure(0, weight=1)
    addLabel = Label(buttonFrame, text=_("Find plug-in modules:"), wraplength=60, justify="center")
    addSelectLocalButton = Button(buttonFrame, text=_("Select"), command=self.selectLocally)
    ToolTip(addSelectLocalButton, text=_("Select python module files from the local plugin directory."), wraplength=240)
    addBrowseLocalButton = Button(buttonFrame, text=_("Browse"), command=self.browseLocally)
    ToolTip(addBrowseLocalButton, text=_("File chooser allows browsing and selecting python module files to add (or reload) plug-ins, from the local file system."), wraplength=240)
    addWebButton = Button(buttonFrame, text=_("On Web"), command=self.findOnWeb)
    ToolTip(addWebButton, text=_("Dialog to enter URL full path to load (or reload) plug-ins, from the web or local file system."), wraplength=240)
    addLabel.grid(row=0, column=0, pady=4)
    addSelectLocalButton.grid(row=1, column=0, pady=4)
    addBrowseLocalButton.grid(row=2, column=0, pady=4)
    addWebButton.grid(row=3, column=0, pady=4)
    buttonFrame.grid(row=0, column=0, rowspan=3, sticky=(N, S, W), padx=3, pady=3)

    # right tree frame (plugins already known to arelle)
    modulesFrame = Frame(frame, width=720)
    vScrollbar = Scrollbar(modulesFrame, orient=VERTICAL)
    hScrollbar = Scrollbar(modulesFrame, orient=HORIZONTAL)
    self.modulesView = Treeview(modulesFrame, xscrollcommand=hScrollbar.set, yscrollcommand=vScrollbar.set, height=7)
    self.modulesView.grid(row=0, column=0, sticky=(N, S, E, W))
    self.modulesView.bind('<<TreeviewSelect>>', self.moduleSelect)
    hScrollbar["command"] = self.modulesView.xview
    hScrollbar.grid(row=1, column=0, sticky=(E, W))
    vScrollbar["command"] = self.modulesView.yview
    vScrollbar.grid(row=0, column=1, sticky=(N, S))
    modulesFrame.columnconfigure(0, weight=1)
    modulesFrame.rowconfigure(0, weight=1)
    modulesFrame.grid(row=0, column=1, columnspan=4, sticky=(N, S, E, W), padx=3, pady=3)
    self.modulesView.focus_set()

    self.modulesView.column("#0", width=120, anchor="w")
    self.modulesView.heading("#0", text=_("Name"))
    self.modulesView["columns"] = ("author", "ver", "status", "date", "update", "descr", "license")
    self.modulesView.column("author", width=100, anchor="w", stretch=False)
    self.modulesView.heading("author", text=_("Author"))
    self.modulesView.column("ver", width=60, anchor="w", stretch=False)
    self.modulesView.heading("ver", text=_("Version"))
    self.modulesView.column("status", width=50, anchor="w", stretch=False)
    self.modulesView.heading("status", text=_("Status"))
    self.modulesView.column("date", width=70, anchor="w", stretch=False)
    self.modulesView.heading("date", text=_("File Date"))
    self.modulesView.column("update", width=50, anchor="w", stretch=False)
    self.modulesView.heading("update", text=_("Update"))
    self.modulesView.column("descr", width=200, anchor="w", stretch=False)
    self.modulesView.heading("descr", text=_("Description"))
    self.modulesView.column("license", width=70, anchor="w", stretch=False)
    self.modulesView.heading("license", text=_("License"))

    classesFrame = Frame(frame)
    vScrollbar = Scrollbar(classesFrame, orient=VERTICAL)
    hScrollbar = Scrollbar(classesFrame, orient=HORIZONTAL)
    self.classesView = Treeview(classesFrame, xscrollcommand=hScrollbar.set, yscrollcommand=vScrollbar.set, height=5)
    self.classesView.grid(row=0, column=0, sticky=(N, S, E, W))
    hScrollbar["command"] = self.classesView.xview
    hScrollbar.grid(row=1, column=0, sticky=(E, W))
    vScrollbar["command"] = self.classesView.yview
    vScrollbar.grid(row=0, column=1, sticky=(N, S))
    classesFrame.columnconfigure(0, weight=1)
    classesFrame.rowconfigure(0, weight=1)
    classesFrame.grid(row=1, column=1, columnspan=4, sticky=(N, S, E, W), padx=3, pady=3)
    self.classesView.focus_set()

    self.classesView.column("#0", width=200, anchor="w")
    self.classesView.heading("#0", text=_("Class"))
    self.classesView["columns"] = ("modules",)
    self.classesView.column("modules", width=500, anchor="w", stretch=False)
    self.classesView.heading("modules", text=_("Modules"))

    # bottom frame module info details
    moduleInfoFrame = Frame(frame, width=700)
    moduleInfoFrame.columnconfigure(1, weight=1)

    self.moduleNameLabel = Label(moduleInfoFrame, wraplength=600, justify="left",
                                 font=font.Font(family='Helvetica', size=12, weight='bold'))
    self.moduleNameLabel.grid(row=0, column=0, columnspan=4, sticky=W)
    self.moduleAuthorHdr = Label(moduleInfoFrame, text=_("author:"), state=DISABLED)
    self.moduleAuthorHdr.grid(row=1, column=0, sticky=W)
    self.moduleAuthorLabel = Label(moduleInfoFrame, wraplength=600, justify="left")
    self.moduleAuthorLabel.grid(row=1, column=1, columnspan=3, sticky=W)
    self.moduleDescrHdr = Label(moduleInfoFrame, text=_("description:"), state=DISABLED)
    self.moduleDescrHdr.grid(row=2, column=0, sticky=W)
    self.moduleDescrLabel = Label(moduleInfoFrame, wraplength=600, justify="left")
    self.moduleDescrLabel.grid(row=2, column=1, columnspan=3, sticky=W)
    self.moduleClassesHdr = Label(moduleInfoFrame, text=_("classes:"), state=DISABLED)
    self.moduleClassesHdr.grid(row=3, column=0, sticky=W)
    self.moduleClassesLabel = Label(moduleInfoFrame, wraplength=600, justify="left")
    self.moduleClassesLabel.grid(row=3, column=1, columnspan=3, sticky=W)
    ToolTip(self.moduleClassesLabel, text=_("List of classes that this plug-in handles."), wraplength=240)
    self.moduleVersionHdr = Label(moduleInfoFrame, text=_("version:"), state=DISABLED)
    self.moduleVersionHdr.grid(row=4, column=0, sticky=W)
    self.moduleVersionLabel = Label(moduleInfoFrame, wraplength=600, justify="left")
    self.moduleVersionLabel.grid(row=4, column=1, columnspan=3, sticky=W)
    ToolTip(self.moduleVersionLabel, text=_("Version of plug-in module."), wraplength=240)
    self.moduleUrlHdr = Label(moduleInfoFrame, text=_("URL:"), state=DISABLED)
    self.moduleUrlHdr.grid(row=5, column=0, sticky=W)
    self.moduleUrlLabel = Label(moduleInfoFrame, wraplength=600, justify="left")
    self.moduleUrlLabel.grid(row=5, column=1, columnspan=3, sticky=W)
    ToolTip(self.moduleUrlLabel, text=_("URL of plug-in module (local file path or web loaded file)."), wraplength=240)
    self.moduleDateHdr = Label(moduleInfoFrame, text=_("date:"), state=DISABLED)
    self.moduleDateHdr.grid(row=6, column=0, sticky=W)
    self.moduleDateLabel = Label(moduleInfoFrame, wraplength=600, justify="left")
    self.moduleDateLabel.grid(row=6, column=1, columnspan=3, sticky=W)
    ToolTip(self.moduleDateLabel, text=_("Date of currently loaded module file (with parenthetical note when an update is available)."), wraplength=240)
    self.moduleLicenseHdr = Label(moduleInfoFrame, text=_("license:"), state=DISABLED)
    self.moduleLicenseHdr.grid(row=7, column=0, sticky=W)
    self.moduleLicenseLabel = Label(moduleInfoFrame, wraplength=600, justify="left")
    self.moduleLicenseLabel.grid(row=7, column=1, columnspan=3, sticky=W)
    self.moduleImportsHdr = Label(moduleInfoFrame, text=_("imports:"), state=DISABLED)
    self.moduleImportsHdr.grid(row=8, column=0, sticky=W)
    self.moduleImportsLabel = Label(moduleInfoFrame, wraplength=600, justify="left")
    self.moduleImportsLabel.grid(row=8, column=1, columnspan=3, sticky=W)
    self.moduleEnableButton = Button(moduleInfoFrame, text=self.ENABLE, state=DISABLED, command=self.moduleEnable)
    ToolTip(self.moduleEnableButton, text=_("Enable/disable plug in."), wraplength=240)
    self.moduleEnableButton.grid(row=9, column=1, sticky=E)
    self.moduleReloadButton = Button(moduleInfoFrame, text=_("Reload"), state=DISABLED, command=self.moduleReload)
    ToolTip(self.moduleReloadButton, text=_("Reload/update plug in."), wraplength=240)
    self.moduleReloadButton.grid(row=9, column=2, sticky=E)
    self.moduleRemoveButton = Button(moduleInfoFrame, text=_("Remove"), state=DISABLED, command=self.moduleRemove)
    ToolTip(self.moduleRemoveButton, text=_("Remove plug in from plug in table (does not erase the plug in's file)."), wraplength=240)
    self.moduleRemoveButton.grid(row=9, column=3, sticky=E)
    moduleInfoFrame.grid(row=2, column=0, columnspan=5, sticky=(N, S, E, W), padx=3, pady=3)
    moduleInfoFrame.config(borderwidth=4, relief="groove")

    okButton = Button(frame, text=_("Close"), command=self.ok)
    ToolTip(okButton, text=_("Accept any changes (if any) and close dialog."), wraplength=240)
    cancelButton = Button(frame, text=_("Cancel"), command=self.close)
    ToolTip(cancelButton, text=_("Cancel changes (if any) and close dialog."), wraplength=240)
    okButton.grid(row=3, column=3, sticky=(S, E), pady=3)
    cancelButton.grid(row=3, column=4, sticky=(S, E), pady=3, padx=3)

    enableDisableFrame = Frame(frame)
    enableDisableFrame.grid(row=3, column=1, sticky=(S, W), pady=3)
    enableAllButton = Button(enableDisableFrame, text=_("Enable All"), command=self.enableAll)
    ToolTip(enableAllButton, text=_("Enable all plug ins."), wraplength=240)
    disableAllButton = Button(enableDisableFrame, text=_("Disable All"), command=self.disableAll)
    ToolTip(disableAllButton, text=_("Disable all plug ins."), wraplength=240)
    enableAllButton.grid(row=1, column=1)
    disableAllButton.grid(row=1, column=2)

    self.loadTreeViews()

    self.geometry("+{0}+{1}".format(dialogX + 50, dialogY + 100))
    frame.grid(row=0, column=0, sticky=(N, S, E, W))
    frame.columnconfigure(0, weight=0)
    frame.columnconfigure(1, weight=1)
    frame.rowconfigure(0, weight=1)
    window = self.winfo_toplevel()
    window.columnconfigure(0, weight=1)
    window.rowconfigure(0, weight=1)

    self.bind("<Return>", self.ok)
    self.bind("<Escape>", self.close)
    self.protocol("WM_DELETE_WINDOW", self.close)
    self.grab_set()
    self.wait_window(self)
try:
    data['author'] = doc.info[0]['Author'].decode("utf-16")
except UnicodeDecodeError:
    data['author'] = str(doc.info[0]['Author'])
except:
    a = 0  # do nothing
'''

# Data retrieval
copy = 'author'  # The section currently being copied ('': none)
# copied = []  (see beginning of the program)
cpt = 0
for line in lines:
    # Abstract
    if reg.match(r'^.{0,5}abstract(s)?.{0,100}\n', line.lower()) and 'abstract' not in copied:
        copy = 'abstract'
        copied.append(copy)
    # Introduction
    elif reg.match(r'^.{0,5}introduction(s)?.{0,3}\n', line.lower()) and 'introduction' not in copied:
        copy = 'introduction'
        copied.append(copy)
    # End of Abstract
    # elif str("1.\n") in line or str("I.\n") in line or str("1.\n") in line:
    elif 'abstract' in copied and reg.match(r'^[1|I].{0,40}+\n', line) and 'introduction' not in copied:
        copy = ''
    # End of Introduction
    elif reg.match(r'^2[\.|\ .+|\n]', line.lower()):
        copy = 'corps'  # end of introduction, set the flag for the body
def _quote_identifier(self, name):
    if re.match(r'^[A-Za-z][A-Za-z_0-9]*$', name):
        return name
    return '`{}`'.format(name)
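# Usage sketch: plain identifiers pass through, anything else gets backtick-quoted.
# self._quote_identifier('user_id')  -> user_id
# self._quote_identifier('order-id') -> `order-id`
# self._quote_identifier('2fast')    -> `2fast`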
def _parse_map_uri(self):
    """
    Parse and validate the map URI, for map-reduce processing.

    The map URI can point to the output URIs of previous workflow steps.
    The map URI template value can take the following forms:

        ${workflow->input-name}: 'input-name' must be part of workflow-level
            inputs (i.e., self._inputs)
        ${step-name->output}: 'step-name' must be a valid step name, and must
            be listed in the 'depend' list.

    Args:
        self: class instance.

    Returns:
        On success: True.
        On failure: False.
    """
    if not self._step['map']['uri']:
        # map URI is an optional definition field
        self._map_uri = []
    else:
        match = re.match(r'\${([^{}]+)->([^{}]+)}', self._step['map']['uri'])
        if match:
            if match.group(1) == 'workflow':  # use workflow-level input uri
                # check if uri name is in input list
                if match.group(2) in self._inputs:
                    # make sure the input URIs to be used as the map URIs are valid
                    for input_uri in self._inputs[match.group(2)]:
                        parsed_map_uri = URIParser.parse(input_uri)
                        if not parsed_map_uri:
                            msg = 'invalid map uri for inputs.{}: {}'\
                                .format(match.group(2), input_uri)
                            Log.an().error(msg)
                            return self._fatal(msg)
                        self._parsed_map_uris.append(parsed_map_uri)
                        self._map_uris.append(parsed_map_uri['chopped_uri'])
                else:
                    msg = 'invalid template reference to input: {}'\
                        .format(self._step['map']['uri'])
                    Log.an().error(msg)
                    return self._fatal(msg)
            else:  # use uri from previous step
                # check if previous step is a dependency
                if match.group(1) in self._step['depend']:
                    if match.group(2) == 'output':
                        self._map_uris.append(
                            self._depend_uris[match.group(1)][0]['chopped_uri'])
                        self._parsed_map_uris.append(
                            self._depend_uris[match.group(1)][0])
                    else:
                        msg = 'invalid template reference, must be "output": {}'\
                            .format(self._step['map']['uri'])
                        Log.an().error(msg)
                        return self._fatal(msg)
                else:
                    # error, not a dependency
                    msg = 'template reference to step must be listed as dependent: {}'\
                        .format(self._step['map']['uri'])
                    Log.an().error(msg)
                    return self._fatal(msg)
        else:
            # invalid format
            msg = 'invalid template value for step map uri: {}'.format(
                self._step['map']['uri'])
            Log.an().error(msg)
            return self._fatal(msg)

    return True
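# Example template values this parser accepts (names are hypothetical; the two
# forms come from the docstring above):
#   ${workflow->input_fastq}  -> expands to the workflow-level input URIs
#   ${align_step->output}     -> expands to the 'align_step' dependency's output URI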
# Conor Rabbitte
# Testing for Thompson's Construction

import regex

# If statement checks that we are in the main method
if __name__ == "__main__":
    # The list `tests` holds the test information: [pattern, string, expected]
    tests = [["a.b|b*", "bbbb", True],
             ["a.b|b*", "abbb", False],
             ["a?.b.c", "abc", True],
             ["a?.b.c", "aaaaabc", False],
             ["a+.b.c", "aaaaaabc", True],
             ["a+.b.c", "abbbc", False],
             ["b*", "b", True],
             ["b*", "", True],
             ["b*", "abbbb", False]]

    # Check each test in tests and assert it against the regex match function
    for test in tests:
        assert regex.match(test[0], test[1]) == test[2], test[0] + \
            (" should match " if test[2] else " should not match ") + test[1]
        # Print a message telling the user whether the test passed or failed
        print("Test: " + test[0] + ", " + test[1] +
              ("\t\tPassed" if test[2] else "\t\tFailed"))
def algorithm(Edgepair_Set, i, j, k):
    # start cycle count
    cycle = 0
    # initiate the first cycle
    CYC = []
    UsedPair = []
    while len(Edgepair_Set) != len(UsedPair):
        for pair in Edgepair_Set:
            while pair not in UsedPair:
                # print('-------------new cycle--------------')
                inputpair = pair
                # print('input: ')
                # print(inputpair)
                # runs while the next input pair is not already part of the cycle.
                # returns False if a cycle contains both oriented passes of a graph edge
                while inputpair not in CYC:
                    # print('input not in CYC')
                    # print(inputpair)
                    # NB: POSIX classes like [[:digit:]] need the third-party
                    # `regex` module (here presumably imported as `re`).
                    basic_pattern1 = '[[:digit:]](\\+|-)'     # eg: '1+'
                    basic_pattern2 = '[h][[:digit:]]'         # eg: 'h1'
                    basic_pattern3 = '[h][[:digit:]](\\+|-)'  # eg: 'h1+'
                    basic_pattern4 = '[v][[:digit:]]'         # eg: 'v1'
                    basic_pattern5 = '[v][[:digit:]](\\+|-)'  # eg: 'v1+'

                    # select edge next to input pair
                    # case: b = i
                    if inputpair[1].isdigit():
                        # print('case: b = i')
                        for str in Edgepair_Set:
                            if str[0] == inputpair[1] + '+' or str[0] == inputpair[1] + '-':
                                if str[2] == multiplication(str[0][1], inputpair[2]):
                                    # print('out: [i+,...]')
                                    outputpair = str
                                    if outputpair[2] == '+':
                                        outputpair_ = [outputpair[1], outputpair[0], '-']
                                    elif outputpair[2] == '-':
                                        outputpair_ = [outputpair[1], outputpair[0], '+']
                                    if outputpair_ in CYC:
                                        return False
                                    break
                    # case: b = i+/-
                    elif inputpair[1][0].isdigit() and not inputpair[1].isdigit():
                        # print('case: b = i+')
                        for str in Edgepair_Set:
                            if str[0] == inputpair[1][0] and str[2] == negative(
                                    multiplication(inputpair[2], inputpair[1][1])):
                                outputpair = str
                                if outputpair[2] == '+':
                                    outputpair_ = [outputpair[1], outputpair[0], '-']
                                elif outputpair[2] == '-':
                                    outputpair_ = [outputpair[1], outputpair[0], '+']
                                if outputpair_ in CYC:
                                    return False
                                break
                    # case: b = hj
                    elif inputpair[1][0] == 'h' and inputpair[1][1].isdigit() and len(inputpair[1]) == 2:
                        # print('case: b = hj')
                        for str in Edgepair_Set:
                            if str[0] == inputpair[1] + '+' or str[0] == inputpair[1] + '-':
                                if str[2] == multiplication(inputpair[2], str[0][2]):
                                    outputpair = str
                                    if outputpair[2] == '+':
                                        outputpair_ = [outputpair[1], outputpair[0], '-']
                                    elif outputpair[2] == '-':
                                        outputpair_ = [outputpair[1], outputpair[0], '+']
                                    if outputpair_ in CYC:
                                        return False
                    # case: b = hj+/-
                    elif inputpair[1][0] == 'h' and inputpair[1][1].isdigit() and len(inputpair[1]) == 3:
                        # print('case: b = hj+')
                        for str in Edgepair_Set:
                            if str[0] == inputpair[1][:-1] and str[2] == negative(
                                    multiplication(inputpair[2], inputpair[1][2])):
                                # print('output: [hi,...]')
                                outputpair = str
                                if outputpair[2] == '+':
                                    outputpair_ = [outputpair[1], outputpair[0], '-']
                                elif outputpair[2] == '-':
                                    outputpair_ = [outputpair[1], outputpair[0], '+']
                                # print('find output')
                                # print(outputpair)
                                if outputpair_ in CYC:
                                    return False
                    # case: b = vk
                    elif inputpair[1][0] == 'v' and inputpair[1][1].isdigit() and len(inputpair[1]) == 2:
                        # print('case: b = vk')
                        for str in Edgepair_Set:
                            if str[0] == inputpair[1] + '+' or str[0] == inputpair[1] + '-':
                                if str[2] == negative(multiplication(inputpair[2], str[0][2])):
                                    outputpair = str
                                    if outputpair[2] == '+':
                                        outputpair_ = [outputpair[1], outputpair[0], '-']
                                    elif outputpair[2] == '-':
                                        outputpair_ = [outputpair[1], outputpair[0], '+']
                                    if outputpair_ in CYC:
                                        return False
                    # case: b = vk+/-
                    elif inputpair[1][0] == 'v' and inputpair[1][1].isdigit() and len(inputpair[1]) == 3:
                        # print('case: b = vk+')
                        for str in Edgepair_Set:
                            if str[0] == inputpair[1][:-1] and str[2] == multiplication(inputpair[2], inputpair[1][2]):
                                # print('find output [vi,..]')
                                outputpair = str
                                if outputpair[2] == '+':
                                    outputpair_ = [outputpair[1], outputpair[0], '-']
                                elif outputpair[2] == '-':
                                    outputpair_ = [outputpair[1], outputpair[0], '+']
                                if outputpair_ in CYC:
                                    return False
                                break
                    # case: b = c & a = hi
                    elif inputpair[1] == 'c' and re.match(basic_pattern2, inputpair[0]) is not None:
                        for str in Edgepair_Set:
                            if str[0] == 'c' and re.match(basic_pattern4, str[1]) is not None and str[2] == inputpair[2]:
                                outputpair = str
                                if outputpair[2] == '+':
                                    outputpair_ = [outputpair[1], outputpair[0], '-']
                                elif outputpair[2] == '-':
                                    outputpair_ = [outputpair[1], outputpair[0], '+']
                                if outputpair_ in CYC:
                                    return False
                                break
                    # case: b = c & a = vk
                    elif inputpair[1] == 'c' and re.match(basic_pattern4, inputpair[0]) is not None:
                        for str in Edgepair_Set:
                            if str[0] == 'c' and re.match(basic_pattern2, str[1]) is not None and str[2] == negative(inputpair[2]):
                                outputpair = str
                                if outputpair[2] == '+':
                                    outputpair_ = [outputpair[1], outputpair[0], '-']
                                elif outputpair[2] == '-':
                                    outputpair_ = [outputpair[1], outputpair[0], '+']
                                if outputpair_ in CYC:
                                    return False
                                break
                    '''
                    print('output')
                    print(outputpair)
                    '''
                    # adds pair to current cycle
                    CYC.append(inputpair)
                    # continues with selected pair as input pair
                    inputpair = outputpair
                    # print('new input')
                    # print(inputpair)

                # Increments count when cycle complete
                if len(CYC) > 1:
                    cycle = cycle + 1
                # tracks which edges have been used in a cycle
                UsedPair.extend(CYC)
                '''
                print('CYC')
                print(CYC)
                print('cycleNum')
                print(cycle)
                '''
                CYC = []
    return cycle == i + j + k + 1
def word_to_tuples(self, word, normpunc=False):
    """Given a word, returns a list of tuples corresponding to IPA segments.

    Args:
        word (unicode): word to transliterate
        normpunc (bool): If True, normalizes punctuation to ASCII inventory

    Returns:
        list: A list of (category, lettercase, orthographic_form,
            phonetic_form, feature_vectors) tuples.

    The "feature vectors" form a list consisting of (segment, vector) pairs.
    For IPA segments, segment is a substring of phonetic_form such that the
    concatenation of all segments in the list is equal to the phonetic_form.
    The vectors are a sequence of integers drawn from the set {-1, 0, 1} where
    -1 corresponds to '-', 0 corresponds to '0', and 1 corresponds to '+'.
    """
    def cat_and_cap(c):
        cat, case = tuple(unicodedata.category(c))
        case = 1 if case == 'u' else 0
        return unicode(cat), case

    def recode_ft(ft):
        try:
            return {'+': 1, '0': 0, '-': -1}[ft]
        except KeyError:
            return None

    def vec2bin(vec):
        return map(recode_ft, vec)

    def to_vector(seg):
        return seg, vec2bin(self.ft.segment_to_vector(seg))

    def to_vectors(phon):
        if phon == '':
            return [(-1, [0] * self.num_panphon_fts)]
        else:
            return [to_vector(seg) for seg in self.ft.ipa_segs(phon)]

    tuples = []
    word = unicode(word)
    # word = self.strip_diacritics.process(word)
    word = unicodedata.normalize('NFKD', word)
    word = unicodedata.normalize('NFC', word)
    while word:
        match = re.match('[A-Za-z]+', word)
        if match:
            span = match.group(0)
            cat, case = cat_and_cap(span[0])
            phonword = self.transliterate(span)
            phonsegs = self.ft.ipa_segs(phonword)
            maxlen = max(len(phonsegs), len(span))
            orth = list(span) + [''] * (maxlen - len(span))
            phonsegs += [''] * (maxlen - len(phonsegs))
            for p, o in zip(phonsegs, orth):
                tuples.append(('L', case, o, p, to_vectors(p)))
            word = word[len(span):]
        else:
            span = word[0]
            span = self.puncnorm.norm(span) if normpunc else span
            cat, case = cat_and_cap(span)
            cat = 'P' if normpunc and cat in self.puncnorm else cat
            phon = ''
            vecs = to_vectors(phon)
            tuples.append((cat, case, span, phon, vecs))
            word = word[1:]
    return tuples
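# Usage sketch (hypothetical Epitran-style object; illustrates the tuple shape only):
# word_to_tuples(u'Hi') would yield one ('L', lettercase, orthographic, phonetic,
# feature_vectors) tuple per aligned letter/segment pair, with the case of the
# span's first character shared by the whole span, e.g.
#   [('L', 1, 'H', 'h', [('h', [...])]), ('L', 1, 'i', 'i', [('i', [...])])]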
class Source:
    def __init__(self, path):
        self.path = path
        self.syms = set()

sources = {}  # filename -> [Source]; initialisation implied by the setdefault below
for dirpath, dirnames, filenames in os.walk("src/wallet"):
    for filename in filenames:
        if filename.endswith(".cpp"):
            sources.setdefault(filename, []).append(
                Source(os.path.join(dirpath, filename)))
sources["wallet.cpp"].append(Source("src/interfaces/wallet.cpp"))

for line in sys.stdin:
    m = regex.match("^([^:]+):.*? undefined reference to `(.*?)'$", line)
    if m:
        filename, sym = m.groups()
        if filename in sources:
            for source in sources[filename]:
                source.syms.add(sym)

for source in (s for sl in sources.values() for s in sl):
    with open(source.path) as fp:
        code = fp.read()
    for sym in source.syms:
        p = sym.find("(")
        if p < 0:
            pattern = r"\b(" + sym + r")\b"
            code = regex.sub(pattern, r"FIXME_IMPLEMENT_IPC_VALUE(\1)", code)
def validate_japanese(word):
    return (not regex.match(r'^\s*$', word)
            and not regex.match(r'\W', word)
            and regex.match(r'\p{Hiragana}|\p{Katakana}|\p{Han}', word))
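# A minimal check, assuming the third-party `regex` module (needed for \p{...}):
assert validate_japanese('ひらがな')    # Japanese script -> truthy match object
assert not validate_japanese('   ')     # whitespace only -> False
assert not validate_japanese('abc')     # no Japanese script -> falsy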
Text_pdf_0_NL = ' '.join(all_pdf_text1.split())
Tokens = Text_pdf_0.split()
Labels = y_final
Real_Tokens = Text_pdf_0_NL.split()

autors_surname = []
for i in range(len(autors[paper])):
    if i % 2 == 0:
        autors_surname.append(autors[paper][i])

autors_surname_lower = []
for i in range(len(autors_surname)):
    autors_surname_lower.append(autors_surname[i].lower())

if re.match('.\.', autors[paper][1]) is None:
    autors_forename = []
    for i in range(len(autors[paper])):
        if i % 2 == 1:
            autors_forename.append(autors[paper][i].split())
    autors_forename = list(np.concatenate((autors_forename), axis=None))

    autors_forename_lower = []
    for i in range(len(autors_forename)):
        autors_forename_lower.append(autors_forename[i].lower())

    autors_surname_lower = list(
        np.concatenate((autors_forename_lower, autors_surname_lower), axis=None))
def sortable_paragraph_number(string):
    MIN_DIGITS = 4
    digits = len(regex.match(r"^\d*", string)[0])
    if not digits:
        return string
    return "0 " * (MIN_DIGITS - digits) + string
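# A quick check of the padding behaviour (MIN_DIGITS is 4 above): short leading
# digit runs get "0 " cells prepended so lexicographic order matches numeric order.
assert sortable_paragraph_number('7a') == '0 0 0 7a'
assert sortable_paragraph_number('1234b') == '1234b'
assert sortable_paragraph_number('abc') == 'abc'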
def build_check_requires_timestamp(t):
    from zipfile import ZipFile
    unused_count = 0
    all_provides = set()
    zf = ZipFile(PLOVR_JAR)
    for zi in zf.infolist():
        if zi.filename.endswith('.js'):
            if not zi.filename.startswith('closure/goog/'):
                continue
            # Skip goog.i18n because it contains so many modules that it causes
            # the generated regular expression to exceed Python's limits
            if zi.filename.startswith('closure/goog/i18n/'):
                continue
            for line in zf.open(zi):
                m = re.match(r'goog.provide\(\'(.*)\'\);', line)
                if m:
                    all_provides.add(m.group(1))
    for filename in sorted(t.dependencies):
        if filename == 'build/src/internal/src/requireall.js':
            continue
        require_linenos = {}
        uses = set()
        lines = open(filename).readlines()
        for lineno, line in _strip_comments(lines):
            m = re.match(r'goog.provide\(\'(.*)\'\);', line)
            if m:
                all_provides.add(m.group(1))
                continue
            m = re.match(r'goog.require\(\'(.*)\'\);', line)
            if m:
                require_linenos[m.group(1)] = lineno
                continue
        ignore_linenos = require_linenos.values()
        for lineno, line in enumerate(lines):
            if lineno in ignore_linenos:
                continue
            for require in require_linenos.iterkeys():
                if require in line:
                    uses.add(require)
        for require in sorted(set(require_linenos.keys()) - uses):
            t.info('%s:%d: unused goog.require: %r' %
                   (filename, require_linenos[require], require))
            unused_count += 1
    all_provides.discard('ol')
    all_provides.discard('ol.MapProperty')

    class Node(object):
        def __init__(self):
            self.present = False
            self.children = {}

        def _build_re(self, key):
            if len(self.children) == 1:
                child_key, child = next(self.children.iteritems())
                child_re = '\\.' + child._build_re(child_key)
                if self.present:
                    return key + '(' + child_re + ')?'
                else:
                    return key + child_re
            elif self.children:
                children_re = '(?:\\.(?:' + '|'.join(
                    self.children[k]._build_re(k)
                    for k in sorted(self.children.keys())) + '))'
                if self.present:
                    return key + children_re + '?'
                else:
                    return key + children_re
            else:
                assert self.present
                return key

        def build_re(self, key):
            return re.compile('\\b' + self._build_re(key) + '\\b')

    root = Node()
    for provide in all_provides:
        node = root
        for component in provide.split('.'):
            if component not in node.children:
                node.children[component] = Node()
            node = node.children[component]
        node.present = True
    provide_res = [child.build_re(key)
                   for key, child in root.children.iteritems()]
    missing_count = 0
    for filename in sorted(t.dependencies):
        if filename in INTERNAL_SRC or filename in EXTERNAL_SRC:
            continue
        provides = set()
        requires = set()
        uses = set()
        uses_linenos = {}
        for lineno, line in _strip_comments(open(filename)):
            m = re.match(r'goog.provide\(\'(.*)\'\);', line)
            if m:
                provides.add(m.group(1))
                continue
            m = re.match(r'goog.require\(\'(.*)\'\);', line)
            if m:
                requires.add(m.group(1))
                continue
            while True:
                for provide_re in provide_res:
                    m = provide_re.search(line)
                    if m:
                        uses.add(m.group())
                        uses_linenos[m.group()] = lineno
                        line = line[:m.start()] + line[m.end():]
                        break
                else:
                    break
        if filename == 'src/ol/renderer/layerrenderer.js':
            uses.discard('ol.renderer.Map')
        m = re.match(r'src/ol/renderer/(\w+)/\1(\w*)layerrenderer\.js\Z', filename)
        if m:
            uses.discard('ol.renderer.Map')
            uses.discard('ol.renderer.%s.Map' % (m.group(1),))
        missing_requires = uses - requires - provides
        if missing_requires:
            for missing_require in sorted(missing_requires):
                t.info("%s:%d missing goog.require('%s')" %
                       (filename, uses_linenos[missing_require], missing_require))
                missing_count += 1
    if unused_count or missing_count:
        t.error('%d unused goog.requires, %d missing goog.requires' %
                (unused_count, missing_count))
    t.touch()
def _inic_dic_vars(símismo):
    símismo.variables.clear()
    símismo._conv_nombres.clear()

    internos = ['FINAL TIME', 'TIME STEP', 'SAVEPER', 'INITIAL TIME']

    for i, f in símismo.mod.doc().iterrows():
        if f['Type'] == 'lookup':
            continue
        nombre = f['Real Name']
        if nombre not in internos:
            nombre_py = f['Py Name']
            unidades = f['Unit']
            líms = literal_eval(f['Lims'])
            ec = f['Eqn']
            obj_ec = Ecuación(ec)
            var_juego = obj_ec.sacar_args_func('GAME') is not None

            if símismo.tipo_mod == '.mdl':
                if regex.match(r'INTEG *\(', ec):
                    tipo = 'nivel'
                else:
                    tipo = 'auxiliar'  # we will change the rest later
            else:
                try:
                    getattr(símismo.mod.components, 'integ_' + nombre_py)
                    tipo = 'nivel'
                except AttributeError:
                    tipo = 'auxiliar'

            parientes = obj_ec.variables()
            if tipo == 'auxiliar' and not len(parientes):
                tipo = 'constante'

            símismo.variables[nombre] = {
                'val': getattr(símismo.mod.components, nombre_py)(),
                'unidades': unidades,
                'ec': ec,
                'hijos': [],
                'parientes': parientes,
                'líms': líms,
                'info': f['Comment'],
                'tipo': tipo,
                'ingreso': True,
                'egreso': not var_juego
            }
            símismo._conv_nombres[nombre] = nombre_py

    # Apply the other variable types
    for niv in símismo.niveles():
        ec = Ecuación(símismo.obt_ec_var(niv), dialecto='vensim')
        if símismo.tipo_mod == '.mdl':
            # Analyze VENSIM's INTEG function
            args_integ, args_inic = ec.sacar_args_func('INTEG')

            # Identify initial-value variables
            if args_inic in símismo.variables:
                símismo.variables[args_inic]['tipo'] = 'inicial'

            # Flows, by definition, are the levels' other parents.
            flujos = [v for v in Ecuación(args_integ, dialecto='vensim').variables()
                      if v not in internos]
        else:
            flujos = ec.variables()

        for flujo in flujos:
            # For each level in the model...
            símismo.variables[flujo]['tipo'] = 'flujo'

    # Detect XMILE initial-value variables
    if símismo.tipo_mod == '.xmile':
        for nv in ET.parse(símismo.archivo).getroot().iter(
                '{http://www.systemdynamics.org/XMILE}stock'):
            inic = nv.find('{http://www.systemdynamics.org/XMILE}eqn').text
            if inic in símismo.variables:
                símismo.variables[inic]['tipo'] = 'inicial'
                símismo.variables[nv.attrib['name']]['parientes'].add(inic)

    # Apply parents to children
    for v, d_v in símismo.variables.items():
        for p in d_v['parientes']:
            d_p = símismo.variables[p]
            d_p['hijos'].append(v)
def __eq__(self, other):
    if not isinstance(other, str):
        return False
    if self.pattern and not regex.match(self.pattern, other):
        return False
    return True
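# An __eq__ written this way makes a pattern-holding object compare equal to
# any string its regex matches. A usage sketch (the class name PatternField is
# hypothetical; only the __eq__ body above comes from the source):
import regex

class PatternField:
    def __init__(self, pattern=None):
        self.pattern = pattern

    def __eq__(self, other):
        if not isinstance(other, str):
            return False
        if self.pattern and not regex.match(self.pattern, other):
            return False
        return True

assert PatternField(r'\d{4}') == '2024'      # regex.match anchors at the start
assert PatternField(r'\d{4}') != 'year2024'  # no match at position 0
assert PatternField() == 'anything'          # no pattern accepts any string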
if {"byr", "iyr", "eyr", "hgt", "hcl", "ecl", "pid"} <= passport.keys(): valid_passports += 1 print(valid_passports) # Part Two valid_passports = 0 for passport in passports_parsed: if {"byr", "iyr", "eyr", "hgt", "hcl", "ecl", "pid"} <= passport.keys(): if ((1920 <= int(passport["byr"]) <= 2002) and (2010 <= int(passport["iyr"]) <= 2020) and (2020 <= int(passport["eyr"]) <= 2030) and (passport["ecl"] in ["amb", "blu", "brn", "gry", "grn", "hzl", "oth"]) and (len(passport["pid"]) == 9)): height_match = regex.match(r"\s*(?P<height>\d+)(?P<unit>cm|in)", passport["hgt"]) if height_match and ( (height_match.group("unit") == "in" and (59 <= int(height_match.group("height")) <= 76)) or (height_match.group("unit") == "cm" and (150 <= int(height_match.group("height")) <= 193))): hair_color_match = regex.match(r"#(?P<color>(?:[0-9]|[a-f])+)", passport["hcl"]) if hair_color_match: valid_passports += 1 print(valid_passports)
"window.scrollTo(0, document.body.scrollHeight);") time.sleep(SLEEP_EACH_SCROLL) ## Retrieve the divs list all_divs = driver.find_elements(By.CSS_SELECTOR, "div[class='v1Nh3 kIKUG _bz0w']") selected_divs = [x for x in all_divs if x not in s] ## Retrieve each image srcset attribute in each div in the divs list img_srcset = [ div.find_element(By.CSS_SELECTOR, "img").get_attribute('srcset') for div in selected_divs ] pattern = re.compile('^http\S+') ## Retrieve the correct image url from image srcset list string_url_imgs = [ re.match(pattern=pattern, string=x).group() for x in img_srcset ] for string_url_img in string_url_imgs: download_img_from_link(string_url_img, selfie_or_not=Boolean) ## Calculate new scroll height and compare with last scroll height ## ... (if the scrolling actually changed something) new_height = driver.execute_script("return document.body.scrollHeight") if new_height == last_height: break last_height = new_height count += 1 # count will be used for pagination afterwards s = set( all_divs
def get_normalized_subnames(self,
                            src_names,
                            separate_to_names=False,
                            config=AutomataVariants.DEFAULT):
    '''
    From a list of surnames for a given person, it creates a set of all
    possible surname variants respecting the settings of lowercase /
    non-accent / ..
    For example:
    * ["Havel"] => ["Havel"]
    * ["O'Connor"] => ["O'Connor", "Connor"]
    * ["van Beethoven"] => ["Ludwig", "Beethoven", "van Beethoven"]
    '''
    if AutomataVariants.isLowercase(config):
        regex_flags = regex.IGNORECASE
    else:
        regex_flags = 0

    # tmp_prepositions in the form of "([Vv]an|[Zz]u|..)"
    tmp_prepositions = reUtils.list2FirstIncaseAlternation(
        self.NAME_PREPOSITIONS)
    regex_prepositions_remove = regex.compile(
        r" {} ".format(tmp_prepositions))
    regex_prepositions_name = regex.compile(
        r" {} \p{{Lu}}\p{{L}}+".format(tmp_prepositions), flags=regex_flags)

    # tmp_prefixes in the form of "([Dd]\\'|[Oo]\\'|..)"
    tmp_prefixes = reUtils.list2FirstIncaseAlternation(self.NAME_PREFIXES)
    regex_prefixes_only_check = regex.compile(
        r"^{}\p{{Lu}}".format(tmp_prefixes), flags=regex_flags)
    regex_prefixes_only = regex.compile(r"^{}".format(tmp_prefixes))

    str_regex_location_remove = r" (?:{}) .*".format("|".join(
        map(regex.escape, self.LOCATION_PREPOSITIONS)))
    regex_location_remove = regex.compile(str_regex_location_remove,
                                          flags=regex_flags)
    regex_name = regex.compile(
        r"^( ?(?:{})?\p{{Lu}}(\p{{L}}+)?(['-]\p{{Lu}}\p{{L}}+)*)+(?:{})?$".
        format(tmp_prefixes, str_regex_location_remove),
        flags=regex_flags
    )  # this should match only a nice name (must support prefixes)
    # regex_name = regex.compile(r"({})?[A-Z][a-z-']+[a-zA-Z]*[a-z]+".format(tmp_prefixes))
    # this should match only a nice name (must support prefixes)

    names = set()

    for name in src_names:
        # normalize whitespace
        name = regex.sub(r'\s+', ' ', name)
        subname_location = regex.search(
            r"([^ ]+" + str_regex_location_remove + r")", name)
        if subname_location:
            subname_location = subname_location.group()
        # remove the location part of the name (e.g. " of Polestown"
        # from the name "Richard Butler of Polestown")
        name = regex_location_remove.sub("", name)
        if name.upper() != name:
            name = name.title()
        if separate_to_names:
            # split the name (without prepositions) into its parts
            subnames = regex_prepositions_remove.sub(" ", name).split()
        else:
            subnames = [name]
        if subname_location:
            subnames.append(subname_location)

        for subname in subnames:
            if not len(subname):
                continue
            if subname[-1] == ",":
                subname = subname[:-1]
            # skip invalid / forbidden names
            if subname not in self.FORBIDDEN_NAMES:
                # normalize the name to start with a capital, including names
                # with a prefix (for example o'... => O'...)
                subname = subname[0].upper() + subname[1:]
                # remove accents, because the python re module does not
                # support [A-Z] for Unicode
                subname_without_accent = remove_accent(subname)
                result = regex_name.match(subname)
                if result:
                    # add the non-accent variant (if required) to processing
                    # (only if not the same as the base name)
                    for subname in [subname, subname_without_accent] if (
                            AutomataVariants.isNonaccent(config)
                            and subname != subname_without_accent) else [subname]:
                        if AutomataVariants.isLowercase(config):
                            subname = subname.lower()
                        names.add(subname)
                        if regex.match(regex_prefixes_only_check, subname):
                            # add also a variant with a lowercase starting
                            # prefix => "o'Conor"
                            if not subname[0].islower():
                                names.add(subname[0].lower() + subname[1:])
                            # from "O'Connor" add also the surname alone,
                            # without the prefix => "Connor"
                            nonprefix = regex_prefixes_only.sub('', subname)
                            names.add(nonprefix.lower()
                                      if AutomataVariants.isLowercase(config)
                                      else nonprefix.capitalize())

        # search for names with a preposition, i.e. "van Eyck"
        preposition_name = regex_prepositions_name.search(name.title())
        if preposition_name:
            match = preposition_name.group()
            # normalize the name to start with a capital, including names with
            # a preposition (for example "van Eyck" => "Van Eyck")
            # Warning: the match contains a leading space (to avoid matching
            # "Ivan Novák" as "van Novák"), so take the substring from the
            # second char
            subname = match[1:].title()
            subname_without_accent = remove_accent(subname)
            # add the non-accent variant (if required) to processing
            # (only if not the same as the base name)
            for subname in [subname, subname_without_accent] if (
                    AutomataVariants.isNonaccent(config)
                    and subname != subname_without_accent) else [subname]:
                if AutomataVariants.isLowercase(config):
                    subname = subname.lower()
                names.add(subname)
                # add also a variant with a lowercase starting
                # preposition => "van Eyck"
                if not subname[0].islower():
                    names.add(subname[0].lower() + subname[1:])

    return names
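# The preposition and prefix alternations above come from
# reUtils.list2FirstIncaseAlternation; a sketch of what it presumably returns
# (the body below is an assumption, only the call sites appear in the source):
import regex

def first_incase_alternation(words):
    # "van" -> "[Vv]an", joined into one alternation group
    return '(' + '|'.join(
        '[{}{}]{}'.format(w[0].upper(), w[0].lower(), w[1:]) for w in words) + ')'

tmp_prepositions = first_incase_alternation(['van', 'von', 'zu'])
assert tmp_prepositions == '([Vv]an|[Vv]on|[Zz]u)'
assert regex.search(' {} '.format(tmp_prepositions), 'Ludwig van Beethoven')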
def AnnotateText(self, text): spans = [] cursor = 0 for match in self.re_latin_word.finditer(text): start, end = match.span() if start > cursor: region = text[cursor:start] spans.append((region, False)) region = text[start:end] spans.append((region, True)) cursor = end if cursor < len(text): region = text[cursor:] spans.append((region, False)) out_spans = [] sent_head = True for start_index in range(0, len(spans)): span, span_is_word = spans[start_index] if not span_is_word: out_spans.append((span, False, None)) continue def CheckSurfaceMatch(surface, title): if surface == title: return True if sent_head and surface != "I": norm_surface = surface[0].lower() + surface[1:] if norm_surface == title: return True return False annots = [] tokens = [] for index in range(start_index, len(spans)): token, token_is_word = spans[index] if token_is_word: tokens.append(token) phrase = " ".join(tokens) variants = [] variants.append((phrase, 1.0)) for infl_base in self.SearchInflections(phrase.lower()): variants.append((infl_base, 0.7)) if index == start_index: match = self.re_aux_contraction.search(token) if match: bare = match.group(1) variants.append((bare, 0.7)) for infl_base in self.SearchInflections(bare.lower()): variants.append((infl_base, 0.6)) suffix = match.group(2).lower() if suffix == "s" and bare.lower() in ("it", "he", "she"): variants.append(("be", 0.0001)) elif suffix == "ve": variants.append(("would", 0.0001)) elif suffix == "d": variants.append(("would", 0.0001)) variants.append(("have", 0.0001)) elif suffix == "ll": variants.append(("will", 0.0001)) elif suffix == "m" or suffix == "re": variants.append(("be", 0.0001)) elif suffix == "em": variants.append(("them", 0.0001)) match = self.re_not_contraction.search(token) if match: bare = match.group(1) lower_bare = bare.lower() if lower_bare == "wo": bare = "will" if lower_bare == "ca": bare = "can" if lower_bare == "sha": bare = "shall" variants.append((bare, 0.7)) variants.append(("not", 0.0001)) match = self.re_multi_possessive.search(token) if match: bare = match.group(1) + match.group(2) for infl_base in self.SearchInflections(bare): variants.append((infl_base, 0.7)) if token.find("-") > 0: for part in token.split("-"): if not regex.search(r"\p{Latin}{3,}", part): continue variants.append((part, 0.0002)) for infl_base in self.SearchInflections(part.lower()): variants.append((infl_base, 0.0001)) uniq_variants = set() uniq_words = set() for variant, var_score in variants: if variant in uniq_variants: continue uniq_variants.add(variant) for entry in self.SearchExact(variant, 10): word = entry["word"] if word in uniq_words: continue uniq_words.add(word) match = False if CheckSurfaceMatch(phrase, word): match = True else: for infl_name in self.infl_names: infl_values = entry.get(infl_name) if infl_values: for infl_value in regex.split(r"[,|]", infl_values): if CheckSurfaceMatch(phrase, infl_value): match = True break if match: break prob = float(entry.get("probability") or 0) prob_score = min(0.05, max(prob ** 0.5, 0.00001)) * 20 aoa = entry.get("aoa") or entry.get("aoa_concept") or entry.get("aoa_base") if aoa: aoa = float(aoa) else: aoa = math.log(prob + 0.00000001) * -1 + 3.5 aoa = min(max(aoa, 3), 20) aoa_score = (25 - min(aoa, 20.0)) / 10.0 entry["aoa_syn"] = int(aoa) tran_score = 1.0 if "translation" in entry else 0.5 item_score = math.log2(len(entry["item"]) + 1) labels = set() for item in entry["item"]: labels.add(item["label"]) label_score = len(labels) + 1 children = entry.get("child") child_score = math.log2((len(children) if 
children else 0) + 4) width_score = (200 if "translation" in entry else 10) ** word.count(" ") match_score = 1.0 if match else 0.2 score = var_score * prob_score * aoa_score * tran_score * item_score * label_score * child_score * match_score * width_score annots.append((entry, score)) elif index == start_index: break elif not regex.match(r"\s", token): break if len(tokens) > 3: break annots = sorted(annots, key=lambda x: x[1], reverse=True) annots = [x[0] for x in annots] out_spans.append((span, True, annots or None)) sent_head = span.find("\n") >= 0 or bool(regex.search(r"[.!?;:]", span)) return out_spans
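# CheckSurfaceMatch above treats a sentence-initial capital as cosmetic,
# except for the pronoun "I"; the same logic as a standalone function
# (function name changed for the sketch):
def check_surface_match(surface, title, sent_head=True):
    if surface == title:
        return True
    if sent_head and surface != "I":
        norm_surface = surface[0].lower() + surface[1:]
        if norm_surface == title:
            return True
    return False

assert check_surface_match("Apple", "apple")   # lowercased at a sentence head
assert not check_surface_match("I", "i")       # the pronoun keeps its case
assert not check_surface_match("Apple", "apple", sent_head=False)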
def roll_dice(roll, *, functions=True, floats=True):
    """
    Rolls dice in dice notation with advanced syntax used according to
    tinyurl.com/pydice
    :param roll: Roll in dice notation
    :return: Result of roll, and an explanation string
    """
    roll = ''.join(roll.split())
    # 'd%' is shorthand for d100; pass IGNORECASE as a keyword so it is not
    # silently consumed as the positional `count` argument of regex.sub
    roll = regex.sub(r'(?<=d)%', '100', roll, flags=regex.IGNORECASE)
    roll = roll.replace('^', '**')
    roll = zero_width_split(
        r'((?<=[\(\),%^\/+*-])(?=.))|((?<=.)(?=[\(\),%^\/+*-]))', roll
    )  # Split the string on the boundary between operators and other chars

    string = []
    results = []

    for group in roll:
        if group in '()/=<>,%^+*-' or group in DEFAULT_FUNCTIONS:
            # Append operators without modification
            results.append(group)
            string.append(group)
            continue
        try:
            explode = regex.match(
                r'^((\d*)d(\d+))!$', group, regex.IGNORECASE
            )  # Regex for exploding dice, ie. 2d10!, 4d100!, d12!, etc.
            specific_explode = regex.match(
                r'^((\d*)d(\d+))!(\d+)$', group
            )  # Regex for exploding dice on a specific number, ie. d20!10 or d12!4
            comparison_explode = regex.match(
                r'^((\d*)d(\d+))!([<>])(\d+)$', group, regex.IGNORECASE
            )  # Regex for exploding dice with a comparison, ie. d20!>10, d6!<2
            penetrate = regex.match(
                r'^((\d*)d(\d+))!p$', group, regex.IGNORECASE
            )  # Penetrating dice are the same as exploding, except any dice
               # after the initial number are added with a -1 penalty
            specific_penetrate = regex.match(
                r'^((\d*)d(\d+))!p(\d+)$', group, regex.IGNORECASE)  # See above
            comparison_penetrate = regex.match(
                r'^((\d*)d(\d+))!p([<>])(\d+)$', group,
                regex.IGNORECASE)  # See above
            reroll = regex.match(
                r'^((\d*)d(\d+))([Rr])$', group,
                regex.IGNORECASE)  # Reroll on a one, matches 1d6R, 4d12r, etc.
            specific_reroll = regex.match(
                r'^((\d*)d(\d+))([Rr])(\d+)$', group,
                regex.IGNORECASE)  # Reroll on a specific number
            comparison_reroll = regex.match(
                r'^((\d*)d(\d+))([Rr])([<>])(\d+)$', group,
                regex.IGNORECASE)  # Reroll on a comparison
            success_comparison = regex.match(
                r'^((?:\d*)d(\d+))([<>])(\d+)$', group, regex.IGNORECASE
            )  # Regex for dice with comparison, ie. 2d10>4, 5d3<2, etc.
            success_fail_comparison = regex.match(
                r'^((?:\d*)d(\d+))(?|((<)(\d+)f(>)(\d+))|((>)(\d+)f(<)(\d+)))$',
                group, regex.IGNORECASE
            )  # Regex for dice with a success comparison and a failure comparison.
            keep = regex.match(
                r'^((?:\d*)d\d+)([Kk])(\d*)$', group, regex.IGNORECASE
            )  # Regex for keeping a number of dice, ie. 2d10K, 2d10k3, etc.
            drop = regex.match(
                r'^((?:\d*)d\d+)([Xx])(\d*)$', group,
                regex.IGNORECASE)  # As above but with dropping dice and X
            individual = regex.match(
                r'^((\d*)d(\d+))([asm])(\d+)$', group, regex.IGNORECASE
            )  # Regex for rolling dice with a modifier attached to each roll
            normal = regex.match(
                r'^((\d*)d(\d+))$', group,
                regex.IGNORECASE)  # Regex for normal dice rolls
            literal = regex.match(
                r'^(\d+)(?!\.)$', group,
                regex.IGNORECASE)  # Regex for number literals.
            float_literal = regex.match(
                r'^(?:(\.\d+)|(\d+\.\d+))$', group,
                regex.IGNORECASE)  # Regex for floats

            if explode is not None:
                # Handle exploding dice without a comparison modifier.
                type_of_dice = int(explode[3])
                result = []
                last_result = roll_group(explode[1])
                result.extend(last_result)
                number_to_roll = num_equal(last_result, '=', type_of_dice)
                while number_to_roll != 0:
                    last_result = roll_group(
                        str(number_to_roll) + 'd' + str(type_of_dice))  # Reroll dice
                    result.extend(last_result)
                    number_to_roll = num_equal(
                        last_result, '=', type_of_dice
                    )  # Check how many dice we have to reroll again
                results.append(sum(result))
                roll = ','.join(
                    [('!' + str(i) if i == type_of_dice else str(i))
                     for i in result]
                )  # Build a string of the dice rolls, adding an exclamation
                   # mark before every roll that resulted in an explosion.
                string.append('[%s]' % roll)
            elif specific_explode is not None:
                # Handle exploding dice on a specific number.
                type_of_dice = int(specific_explode[3])
                comparator = int(specific_explode[4])
                assert 0 < comparator <= type_of_dice
                result = []
                last_result = roll_group(specific_explode[1])
                result.extend(last_result)
                number_to_roll = num_equal(last_result, '=', comparator)
                while number_to_roll != 0:
                    last_result = roll_group(
                        str(number_to_roll) + 'd' + str(type_of_dice))
                    result.extend(last_result)
                    number_to_roll = num_equal(last_result, '=', comparator)
                results.append(sum(result))
                roll = ','.join(
                    [('!' + str(i) if i == comparator else str(i))
                     for i in result]
                )  # Build a string of the dice rolls, adding an exclamation
                   # mark before every roll that resulted in an explosion.
                string.append('[%s]' % roll)
            elif comparison_explode is not None:
                # Handle exploding dice with a comparison modifier
                type_of_dice = int(comparison_explode[3])
                comparator = int(comparison_explode[5])
                if comparison_explode[4] == '>':
                    # Ensure the comparison is within bounds
                    assert 0 < comparator < type_of_dice
                else:
                    assert 1 < comparator <= type_of_dice
                result = []
                last_result = roll_group(comparison_explode[1])
                result.extend(last_result)
                if comparison_explode[4] == '>':
                    number_to_roll = num_equal(last_result, '>', comparator)
                    while number_to_roll != 0:
                        last_result = roll_group(
                            str(number_to_roll) + 'd' + str(type_of_dice))
                        result.extend(last_result)
                        number_to_roll = num_equal(last_result, '>', comparator)
                    roll = ','.join(
                        [('!' + str(i) if i > comparator else str(i))
                         for i in result]
                    )  # Same as on other explodes except with a > or < comparison
                else:
                    number_to_roll = num_equal(last_result, '<', comparator)
                    while number_to_roll != 0:
                        last_result = roll_group(
                            str(number_to_roll) + 'd' + str(type_of_dice))
                        result.extend(last_result)
                        number_to_roll = num_equal(last_result, '<', comparator)
                    roll = ','.join(
                        [('!' + str(i) if i < comparator else str(i))
                         for i in result]
                    )  # Same as on other explodes except with a > or < comparison
                results.append(sum(result))
                string.append('[%s]' % roll)
            elif penetrate is not None:
                # Handle penetrating dice without a comparison modifier.
                type_of_dice = int(penetrate[3])
                first_num = int(penetrate[2])
                result = []
                last_result = roll_group(penetrate[1])
                result.extend(last_result)
                number_to_roll = num_equal(last_result, '=', type_of_dice)
                while number_to_roll != 0:
                    last_result = roll_group(
                        str(number_to_roll) + 'd' + str(type_of_dice))
                    result.extend(last_result)
                    number_to_roll = num_equal(last_result, '=', type_of_dice)
                pre_result = result[:first_num]  # The first rolls, unmodified
                pre_result.extend([
                    x - 1 for x in result[first_num:]
                ])  # The later rolls with a -1 modifier
                results.append(sum(pre_result))
                roll = ','.join([
                    '!' + str(i) if i == type_of_dice else str(i)
                    for i in result[:first_num]
                ])  # The first numbers, without the -1 but with a ! on penetrations
                roll += (
                    ',' if len(pre_result) > first_num else ''
                )  # Only add the comma in between if there's at least one penetration
                roll += ','.join([
                    ('!' + str(i) + '-1' if i == type_of_dice else str(i) + '-1')
                    for i in result[first_num:]
                ])  # The penetration dice with the '-1' tacked on the end
                string.append('[%s]' % roll)
            elif specific_penetrate is not None:
                # Handle penetrating dice on a specific number.
                type_of_dice = int(specific_penetrate[3])
                first_num = int(specific_penetrate[2])
                comparator = int(specific_penetrate[4])
                assert 0 < comparator <= type_of_dice
                result = []
                last_result = roll_group(specific_penetrate[1])
                result.extend(last_result)
                number_to_roll = num_equal(last_result, '=', comparator)
                while number_to_roll != 0:
                    last_result = roll_group(
                        str(number_to_roll) + 'd' + str(type_of_dice))
                    result.extend(last_result)
                    number_to_roll = num_equal(last_result, '=', comparator)
                pre_result = result[:first_num]  # Same as normal penetration
                pre_result.extend([x - 1 for x in result[first_num:]])
                results.append(sum(pre_result))
                roll = ','.join([
                    '!' + str(i) if i == comparator else str(i)
                    for i in result[:first_num]
                ])  # Same as above
                roll += (',' if len(pre_result) > first_num else '')
                roll += ','.join([
                    ('!' + str(i) + '-1' if i == comparator else str(i) + '-1')
                    for i in result[first_num:]
                ])
                string.append('[%s]' % roll)
            elif comparison_penetrate is not None:
                # Handle penetrating dice with a comparison modifier.
                type_of_dice = int(comparison_penetrate[3])
                comparator = int(comparison_penetrate[5])
                first_num = int(comparison_penetrate[2])
                if comparison_penetrate[4] == '>':
                    # Ensure the comparison is within bounds
                    assert 0 < comparator < type_of_dice
                else:
                    assert 1 < comparator <= type_of_dice
                result = []
                last_result = roll_group(comparison_penetrate[1])
                result.extend(last_result)
                # Do penetration based on the more-than or less-than sign.
                if comparison_penetrate[4] == '>':
                    number_to_roll = num_equal(last_result, '>', comparator)
                    while number_to_roll != 0:
                        last_result = roll_group(
                            str(number_to_roll) + 'd' + str(type_of_dice))
                        result.extend(last_result)
                        number_to_roll = num_equal(last_result, '>', comparator)
                else:
                    number_to_roll = num_equal(last_result, '<', comparator)
                    while number_to_roll != 0:
                        last_result = roll_group(
                            str(number_to_roll) + 'd' + str(type_of_dice))
                        result.extend(last_result)
                        number_to_roll = num_equal(last_result, '<', comparator)
                pre_result = result[:first_num]
                pre_result.extend([x - 1 for x in result[first_num:]])
                results.append(sum(pre_result))
                if comparison_penetrate[4] == '>':
                    roll = ','.join([
                        '!' + str(i) if i > comparator else str(i)
                        for i in result[:first_num]
                    ])  # Same as above
                    roll += (',' if len(pre_result) > first_num else '')
                    roll += ','.join([
                        ('!' + str(i) + '-1' if i > comparator else str(i) + '-1')
                        for i in result[first_num:]
                    ])
                else:
                    roll = ','.join([
                        '!' + str(i) if i < comparator else str(i)
                        for i in result[:first_num]
                    ])  # Same as above
                    roll += (',' if len(pre_result) > first_num else '')
                    roll += ','.join([
                        ('!' + str(i) + '-1' if i < comparator else str(i) + '-1')
                        for i in result[first_num:]
                    ])
                string.append('[%s]' % roll)
            elif reroll is not None:
                # Handle rerolling dice without a comparison modifier (ie. on 1)
                type_of_dice = int(reroll[3])
                result_strings = []
                roll_strings = []
                result = roll_group(reroll[1])
                repeat = (reroll[4] == 'R')  # 'R' rerolls repeatedly, 'r' just once
                if repeat:
                    # Reroll the dice and build a string of all the rerolled ones
                    for i in range(len(result)):
                        prev = [result[i]]
                        while result[i] == 1:
                            result[i] = random.randint(1, type_of_dice)
                            prev.append(result[i])
                        roll_strings.append([str(x) for x in prev])
                else:
                    for i in range(len(result)):
                        prev = [result[i]]
                        if result[i] == 1:
                            result[i] = random.randint(1, type_of_dice)
                            prev.append(result[i])
                        roll_strings.append([str(x) for x in prev])
                results.append(sum(result))
                for roll_string in roll_strings:
                    roll_string.reverse()
                    result_strings.append(
                        '%s' % roll_string[0] +
                        ('~' if len(roll_string) > 1 else '') +
                        '~'.join(roll_string[1:]))  # Build the string
                roll = ','.join(result_strings)
                string.append('[%s]' % roll)
            elif specific_reroll is not None:
                # Handle rerolling dice on a specific number, see reroll
                type_of_dice = int(specific_reroll[3])
                comparator = int(specific_reroll[5])
                assert 0 < comparator <= type_of_dice  # Ensure the comparison is within bounds
                result_strings = []
                roll_strings = []
                result = roll_group(specific_reroll[1])
                repeat = (specific_reroll[4] == 'R')
                if repeat:
                    for i in range(len(result)):
                        prev = [result[i]]
                        while result[i] == comparator:
                            result[i] = random.randint(1, type_of_dice)
                            prev.append(result[i])
                        roll_strings.append([str(x) for x in prev])
                else:
                    for i in range(len(result)):
                        prev = [result[i]]
                        if result[i] == comparator:
                            result[i] = random.randint(1, type_of_dice)
                            prev.append(result[i])
                        roll_strings.append([str(x) for x in prev])
                results.append(sum(result))
                for roll_string in roll_strings:
                    roll_string.reverse()
                    result_strings.append(
                        '%s' % roll_string[0] +
                        ('~' if len(roll_string) > 1 else '') +
                        '~'.join(roll_string[1:]))
                roll = ','.join(result_strings)
                string.append('[%s]' % roll)
            elif comparison_reroll is not None:
                # Handle rerolling dice with a comparison modifier.
                type_of_dice = int(comparison_reroll[3])
                comparator = int(comparison_reroll[6])
                if comparison_reroll[5] == '>':
                    # Ensure the comparison is within bounds
                    assert 0 < comparator < type_of_dice
                else:
                    assert 1 < comparator <= type_of_dice
                result_strings = []
                roll_strings = []
                result = roll_group(comparison_reroll[1])
                repeat = (comparison_reroll[4] == 'R')
                if comparison_reroll[5] == '>':
                    if repeat:
                        for i in range(len(result)):
                            prev = [result[i]]
                            while result[i] > comparator:
                                result[i] = random.randint(1, type_of_dice)
                                prev.append(result[i])
                            roll_strings.append([str(x) for x in prev])
                    else:
                        for i in range(len(result)):
                            prev = [result[i]]
                            if result[i] > comparator:
                                result[i] = random.randint(1, type_of_dice)
                                prev.append(result[i])
                            roll_strings.append([str(x) for x in prev])
                else:
                    if repeat:
                        for i in range(len(result)):
                            prev = [result[i]]
                            while result[i] < comparator:
                                result[i] = random.randint(1, type_of_dice)
                                prev.append(result[i])
                            roll_strings.append([str(x) for x in prev])
                    else:
                        for i in range(len(result)):
                            prev = [result[i]]
                            if result[i] < comparator:
                                result[i] = random.randint(1, type_of_dice)
                                prev.append(result[i])
                            roll_strings.append([str(x) for x in prev])
                results.append(sum(result))
                for roll_string in roll_strings:
                    roll_string.reverse()
                    result_strings.append(
                        '%s' % roll_string[0] +
                        ('~' if len(roll_string) > 1 else '') +
                        '~'.join(roll_string[1:]))
                roll = ','.join(result_strings)
                string.append('[%s]' % roll)
            elif success_comparison is not None:
                group_result = roll_group(success_comparison[1])
                result = []
                result_string = []
                type_of_dice = int(success_comparison[2])
                comparator = int(success_comparison[4])
                if success_comparison[3] == '>':
                    # Ensure the comparison is within bounds
                    assert 0 < comparator < type_of_dice
                else:
                    assert 1 < comparator <= type_of_dice
                for die in group_result:
                    if success_comparison[3] == '>':
                        result.append(1 if die > comparator else 0)
                        result_string.append(
                            '!' + str(die) if die > comparator else str(die))
                    else:
                        result.append(1 if die < comparator else 0)
                        result_string.append(
                            '!' + str(die) if die < comparator else str(die))
                results.append(sum(result))
                roll = ','.join(
                    result_string
                )  # Craft the string, adding an exclamation mark before every
                   # die that passed the comparison.
                string.append('[%s]' % roll)
            elif success_fail_comparison is not None:
                group_result = roll_group(success_fail_comparison[1])
                result = []
                result_string = []
                type_of_dice = int(success_fail_comparison[2])
                success_comp = int(success_fail_comparison[5])
                fail_comp = int(success_fail_comparison[7])
                # Ensure both comparisons are within bounds
                if success_fail_comparison[4] == '>':
                    assert 0 < success_comp < type_of_dice
                    assert 1 < fail_comp <= type_of_dice
                else:
                    assert 1 < success_comp <= type_of_dice
                    assert 0 < fail_comp < type_of_dice
                for die in group_result:
                    if success_fail_comparison[4] == '>':
                        # Get the actual list of successes and fails with both comparisons
                        if die > success_comp:
                            result.append(1)
                            result_string.append('!' + str(die))
                        elif die < fail_comp:
                            result.append(-1)
                            result_string.append('*' + str(die))
                        else:
                            result.append(0)
                            result_string.append(str(die))
                    else:
                        if die < success_comp:
                            result.append(1)
                            result_string.append('!' + str(die))
                        elif die > fail_comp:
                            result.append(-1)
                            result_string.append('*' + str(die))
                        else:
                            result.append(0)
                            result_string.append(str(die))
                results.append(sum(result))
                roll = ','.join(result_string)
                string.append('[%s]' % roll)
            elif keep is not None:
                # Handle rolling dice and keeping the x highest or lowest values
                group_result = roll_group(keep[1])
                group_result.sort(
                    reverse=(keep[2] == 'K')
                )  # Uppercase keeps the highest, lowercase keeps the lowest.
                num_to_keep = int(keep[3] if keep[3] != '' else 1)
                assert 1 <= num_to_keep < len(group_result)
                results.append(sum(group_result[:num_to_keep]))
                roll = ','.join(
                    [str(i) for i in group_result[:num_to_keep]]
                ) + ' ~~ '  # Format the string with all kept rolls on the left
                            # and dropped rolls on the right
                roll += ','.join([str(i) for i in group_result[num_to_keep:]])
                string.append('[%s]' % roll)
            elif drop is not None:
                group_result = roll_group(drop[1])
                group_result.sort(reverse=(drop[2] == 'X'))  # Same as keep dice
                num_to_drop = int(drop[3] if drop[3] != '' else 1)
                assert 1 <= num_to_drop < len(group_result)
                # Sum the kept dice; the dropped ones appear after the ' ~~ '
                # separator in the explanation string
                results.append(sum(group_result[num_to_drop:]))
                roll = ','.join([
                    str(i) for i in group_result[num_to_drop:]
                ]) + ' ~~ '  # Same as above.
                roll += ','.join([str(i) for i in group_result[:num_to_drop]])
                string.append('[%s]' % roll)
            elif individual is not None:
                group_result = roll_group(individual[1])
                result = []
                for i, j in enumerate(group_result):
                    # Apply the modifier to each roll
                    if individual[4] == 'a':
                        result.append(j + int(individual[5]))
                    elif individual[4] == 's':
                        result.append(j - int(individual[5]))
                    elif individual[4] == 'm':
                        result.append(j * int(individual[5]))
                    else:
                        raise ValueError
                results.append(sum(result))
                roll = ','.join([
                    str(x) + individual[4] + individual[5]
                    for x in group_result
                ])  # Create the string with the modifier on each roll
                string.append('[%s]' % roll)
            elif normal is not None:
                group_result = roll_group(group)
                results.append(sum(group_result))
                roll = ','.join([str(i) for i in group_result])
                string.append('[%s]' % roll)
            elif literal is not None:
                results.append(int(literal[1]))  # Just append the integer value
                string.append(literal[1])
            elif float_literal is not None:
                if floats:
                    results.append(float(group))
                    string.append(group)
                else:
                    raise TypeError
            else:
                raise Exception
        except Exception:
            raise DiceGroupException('"%s" is not a valid dicegroup.' % group)

    parser = SimpleEval(
        floats=floats, functions=functions
    )  # The parser object parses the dice rolls and functions
    try:
        final_result = parser.eval(''.join(
            [str(x) for x in results]))  # Parse everything into one value
        if not floats:
            final_result = int(final_result)
    except Exception:
        raise DiceOperatorException('Error parsing operators and/or functions')

    # Create the explanation string and remove extraneous spaces
    explanation = ''.join(string)
    explanation = zero_width_split(
        r"""((?<=[\/%^+])(?![\/,]))| # Split between /, %, ^, and +
        ((?<![\/,])(?=[\/%^+]))| # Same as above
        ((?<=[^(])(?=-))(?!-[^[]*])| # Split in front of - that are not in a roll
        (?<=-)(?=[^\d()a-z])| # Same for splitting after - and before non-literals
        (?<=[\d)\]]-)(?=.)(?![^[]*])| # Split after a - that is not in a roll
        (?<=,)(?![^[]*])| # Split after a comma that is not in a roll
        (?<=([^,]\*))(?!\*)| # Split after a * that is not in a roll
        (?<![,\*])(?=\*) # Split before a * that is not in a roll""",
        explanation)  # Split on ops to properly format the explanation
    explanation = ' '.join(explanation)
    explanation = explanation.strip()
    explanation = regex.sub(r'[ \t]{2,}', ' ', explanation)
    return final_result, explanation
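# The dicegroup regexes can be sanity-checked in isolation, without touching
# roll_group or the parser (the sample groups below are made up for the test):
import regex

explode = regex.match(r'^((\d*)d(\d+))!$', '2d10!', regex.IGNORECASE)
assert explode[1] == '2d10' and int(explode[3]) == 10

keep = regex.match(r'^((?:\d*)d\d+)([Kk])(\d*)$', '4d6k3')
assert keep[1] == '4d6' and keep[2] == 'k' and keep[3] == '3'

literal = regex.match(r'^(\d+)(?!\.)$', '17')
assert literal[1] == '17'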
import os
import json
import regex as re

INPUT_DIR = '/var/judgments/'
OUTPUT_DIR = './data/'
SOFT_LIMIT = 1147483648

gathered = 0
files = (file for file in os.listdir(INPUT_DIR)
         if re.match(r'judgments-\d+\.json', file))
for file in files:
    with open(os.path.join(INPUT_DIR, file)) as f:
        judgments = json.load(f)
    file_name, file_ext = os.path.splitext(file)
    for judgment in judgments['items']:
        # Strip markup tags and de-hyphenate hard line breaks in one pass
        content = re.sub(r'<[^>]*>|-?\n', '', judgment['textContent'],
                         flags=re.WORD)
        with open(os.path.join(OUTPUT_DIR,
                               f'{file_name}.{judgment["id"]}.txt'), 'w') as f2:
            f2.write(content)
        gathered += len(content)
        if gathered >= SOFT_LIMIT:
            break
    if gathered >= SOFT_LIMIT:  # stop processing further files as well
        break
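# What the cleanup regex does on a small sample: tags are dropped and
# hyphenated line breaks are joined. The re.WORD flag alters word-boundary
# matching (\b and \B), which this particular pattern does not use:
import regex as re

sample = 'Judg-\nment <b>text</b>\ncontinues'
assert re.sub(r'<[^>]*>|-?\n', '', sample, flags=re.WORD) == 'Judgment textcontinues'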
def _generalized_check(script: str, word: str) -> bool:
    prop = ("Block"
            if script == "Katakana" or script == "Hiragana" else "Script")
    regex_string = rf"^[\p{{{prop}={script}}}']+$"
    return bool(regex.match(regex_string, word))
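# Usage sketch: the Block-vs-Script distinction matters because characters
# like the prolonged sound mark 'ー' (U+30FC) carry Script=Common but sit in
# the Katakana block, so a Script-based test would reject everyday words:
assert _generalized_check("Katakana", "コーヒー")  # passes via Block=Katakana
assert _generalized_check("Latin", "word's")      # apostrophe allowed explicitly
assert not _generalized_check("Latin", "кириллица")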
def _inic_dic_vars(símismo):
    # Clear anything that might be left over from before.
    símismo.variables.clear()

    cuerpo = símismo.dic_doc['cuerpo']

    l_tx_vars = []
    nuevo_var = True
    for fl in cuerpo:
        f = fl.strip().rstrip('\\')
        if len(f):
            if nuevo_var:
                l_tx_vars.append(f)
            else:
                l_tx_vars[-1] += f
            nuevo_var = (f[-1] == '|')

    for tx_var in l_tx_vars:
        tx_ec, tx_unids_líms, tx_info = tx_var.strip('|').split('~')
        obj_ec = Ecuación(tx_ec, dialecto='vensim')
        if obj_ec.tipo == 'sub':
            continue
        var = obj_ec.nombre
        try:
            tx_unids, tx_líms = tx_unids_líms.split('[')
        except ValueError:
            tx_unids = tx_unids_líms
            tx_líms = ''
        if len(tx_líms):
            líms = tuple([
                float(x) if x.strip() != '?' else None
                for x in tx_líms.strip(']').split(',')
            ][:2])
        else:
            líms = (None, None)
        símismo.variables[var] = {
            'val': None,
            'unidades': tx_unids.strip(),
            'ec': str(obj_ec),
            'ingreso': None,
            'dims': (1, ),  # To do
            'líms': líms,
            'subscriptos': None,  # To do
            'hijos': [],
            'parientes': obj_ec.variables(),
            'egreso': None,
            'info': tx_info.strip(),
            'val_inic': False
        }

    for v, d_v in símismo.variables.items():
        for p in d_v['parientes']:
            d_p = símismo.variables[p]
            d_p['hijos'].append(v)

    # Clear whatever was in the following lists before:
    símismo.flujos.clear()
    símismo.auxiliares.clear()
    símismo.constantes.clear()
    símismo.niveles.clear()

    # Keep a list of the names of "level" (stock) variables
    símismo.niveles += [
        x for x, d in símismo.variables.items()
        if regex.match(r'INTEG *\(', d['ec'])
    ]

    # By definition, flows are the parents of the levels.
    for niv in símismo.niveles:
        # The first argument of Vensim's INTEG function
        ec = Ecuación(símismo.variables[niv]['ec'], dialecto='vensim')
        arg_integ = ec.sacar_args_func('INTEG', i=1)[0]
        args_inic = ec.sacar_args_func('INTEG')[1]
        if args_inic in símismo.variables:
            símismo.variables[args_inic]['val_inic'] = True

        # Extract the flow variables
        flujos = [
            v for v in Ecuación(arg_integ, dialecto='vensim').variables()
            if v not in símismo.internos
        ]
        for flujo in flujos:
            # Add each flow, unless it is already in the list of flows.
            if flujo not in símismo.flujos and flujo not in símismo.niveles:
                símismo.flujos.append(flujo)

    # Auxiliaries are the variables that are neither levels nor flows
    # but do have parents.
    símismo.auxiliares += [
        x for x, d in símismo.variables.items()
        if x not in símismo.niveles and x not in símismo.flujos
        and len(d['parientes'])
    ]

    # Constants are the variables that remain.
    símismo.constantes += [
        x for x, d in símismo.variables.items()
        if not len(d['parientes'])
        and not any(h in símismo.flujos for h in d['hijos'])
    ]
    dirs.sort(key=nkey)
    if not any(
            fname.upper().endswith(".PDF") for fname in os.listdir(subdir)):
        continue
    nrope = (os.path.basename(os.path.dirname(subdir)) + "_" +
             os.path.basename(subdir))
    print(str(countope) + "\t" + nrope)
    # print(str(ile) + "_" + str(countope) + "\t" + nrope)
    countope += 1
    for file in natsorted(files):
        if file.upper().endswith(".PDF") and regex.match(
                r"^.+(-SZK-|-M-|-Z-).+\.PDF", file.upper()):
            plik = os.path.join(subdir, file)
            try:
                doc = fitz.open(plik)
                strony = doc.pageCount
                if strony != 1:
                    with io.open(plikwynik, "a", encoding="utf-8") as wynik:
                        wynik.write(str(strony) + "\t" + plik + "\n")
                    continue
            except Exception:
                # Record files PyMuPDF could not open, instead of hiding all
                # errors behind a bare except
                with io.open(bledny, "a", encoding="utf-8") as bl:
                    bl.write(plik + "\n")
                continue
import regex
import sys

RULE_REGEX = (r'^(?P<container>[a-z ]+) bags contain '
              r'(((?P<contained>\d+ [a-z ]+) bags?(, |\.))+|(?:no other bags\.))$')

for line in sys.stdin.readlines():
    print(regex.match(RULE_REGEX, line.strip()).groupdict())
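# The repeated 'contained' group keeps only its last match under group(); the
# regex module's captures() recovers every repetition, something the stdlib re
# cannot do and presumably the reason regex is imported here:
import regex

RULE_REGEX = (r'^(?P<container>[a-z ]+) bags contain '
              r'(((?P<contained>\d+ [a-z ]+) bags?(, |\.))+|(?:no other bags\.))$')
rule = 'light red bags contain 1 bright white bag, 2 muted yellow bags.'
m = regex.match(RULE_REGEX, rule)
assert m.group('contained') == '2 muted yellow'
assert m.captures('contained') == ['1 bright white', '2 muted yellow']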
def get_raw_dates(text, strict=False, base_date=None, return_source=False, locale=None) -> Generator: """ Find "raw" or potential date matches prior to false positive classification. :param text: raw text to search :param strict: whether to return only complete or strict matches :param base_date: base date to use for implied or partial matches :param return_source: whether to return raw text around date :param locale: locale object :return: """ # Setup base date if not base_date: base_date = datetime.datetime.now().replace(day=1, month=1, hour=0, minute=0, second=0, microsecond=0) # Find potential dates date_finder = DateFinder(base_date=base_date) for extra_token in date_finder.EXTRA_TOKENS_PATTERN.split('|'): if extra_token != 't': date_finder.REPLACEMENTS[extra_token] = ' ' # Iterate through possible matches possible_dates = list(date_finder.extract_date_strings(text, strict=strict)) possible_matched = [] for i, possible_date in enumerate(possible_dates): # Get date_string = possible_date[0] index = possible_date[1] date_props = possible_date[2] # Cleanup "day of" strings if "of" in date_props["extra_tokens"] or "OF" in date_props[ "extra_tokens"]: num_dig_mod = len(possible_dates[i - 1][2]["digits_modifier"]) if i > 0 and not possible_matched[i - 1] and num_dig_mod == 1: date_props["digits_modifier"].extend( possible_dates[i - 1][2]["digits_modifier"]) date_string = possible_dates[i - 1][2]["digits_modifier"].pop() \ .replace("st", "") \ .replace("nd", "") \ .replace("rd", "") \ .replace("th", "") + date_string # Skip only digits modifiers num_dig_mod = len(date_props["digits_modifier"]) num_dig = len(date_props["digits"]) num_days = len(date_props["days"]) num_month = len(date_props["months"]) num_slash = date_props["delimiters"].count("/") num_point = date_props["delimiters"].count(".") num_hyphen = date_props["delimiters"].count("-") # Remove double months if num_month > 1: possible_matched.append(False) continue # Remove wrong months like Dec*ided or Mar*tin if num_month == 1 and date_props['extra_tokens'] \ and (date_props['months'][0] + date_props['extra_tokens'][-1]) in date_string: possible_matched.append(False) continue # Check strange strings if num_dig_mod > 0 and num_dig == 0: possible_matched.append(False) continue # Skip DOW only if num_days > 0 and num_dig == 0: possible_matched.append(False) continue # Skip DOM only if num_month == 0 and num_dig_mod == 0 and num_dig <= 1: possible_matched.append(False) continue # Skip odd date like "1 10" if re.match(r'\d{1,2}\s+\d{1,2}', date_string): possible_matched.append(False) continue # Skip floats if num_point and not num_month and not re.match( r'\d{2}\.\d{2}\.\d{2,4}', date_string): possible_matched.append(False) continue # Skip odd months from string like "Nil 62. 
Marquee" if re.search(r'\d{2,4}\.\s*[A-Za-z]', date_string): possible_matched.append(False) continue # Skip fractions if (num_slash == 1 or num_hyphen == 1) and num_dig > 2: possible_matched.append(False) continue # Skip three-digit blocks and double zero years found_triple = False found_dz = False for digit in date_props["digits"]: if len(digit) == 3: found_triple = True if digit.startswith("00"): found_dz = True if found_triple or found_dz: possible_matched.append(False) continue # Skip "may" alone if num_dig == 0 and num_days == 0 and "".join( date_props["months"]).lower() == "may": possible_matched.append(False) continue # Skip cases like "13.2 may" or "12.12may" if (num_dig > 0 and (num_point + num_slash + num_hyphen) > 0 and "".join(date_props["months"]).lower() == "may"): possible_matched.append(False) continue # Cleanup for token in sorted(date_props["extra_tokens"], key=len, reverse=True): if token.lower() in ["to", "t"]: continue date_string = date_string.replace(token, "") date_string = date_string.strip() date_props["extra_tokens"] = [] # Skip strings too long if len(date_string) > DATE_MAX_LENGTH: possible_matched.append(False) continue # Skip numbers only match_delims = set("".join(date_props["delimiters"])) bad_delims = {",", " ", "\n", "\t"} len_diff_set = len(match_delims - bad_delims) if len_diff_set == 0 and num_month == 0: possible_matched.append(False) continue # Parse and skip nones date = None try: date_string_tokens = date_string.split() for cutter in range(len(date_string_tokens)): for direction in (0, 1): if cutter > 0: if direction: _date_string_tokens = date_string_tokens[cutter:] else: _date_string_tokens = date_string_tokens[:-cutter] date_string = ' '.join(_date_string_tokens) try: date = date_finder.parse_date_string(date_string, date_props, locale=locale) # pylint: disable=broad-except except: date = None if date: break else: continue # executed if the loop ended normally (no break) break # executed if 'continue' was skipped (break) except TypeError: possible_matched.append(False) continue if date and not check_date_parts_are_in_date(date, date_props): date = None if not date: possible_matched.append(False) continue # for case when datetime.datetime(2001, 1, 22, 20, 1, tzinfo=tzoffset(None, -104400)) if hasattr(date, 'tzinfo'): try: _ = date.isoformat() except ValueError: possible_matched.append(False) continue possible_matched.append(True) if isinstance( date, datetime.datetime) and date.hour == 0 and date.minute == 0: date = date.date() # Append if return_source: yield (date, index) else: yield date
# Printing the header on the screen
print("-----------------------Muhammad Luqman------------------G00353385"
      "----------------Graph Theory Project------------------------")

# Prompt the user to enter the infix notation to match
u_infix = input("Please enter an infix: ")
# Validation in case the user leaves the infix empty
while u_infix == "":
    print("Cannot be empty!")
    u_infix = input("Please enter an infix: ")

# Prompt the user to enter the string to match
string = input("Please enter a string: ")
# Validation in case the user leaves the string empty
while string == "":
    print("Cannot be empty!")
    string = input("Please enter a string: ")

# regex.match may return a Match object or None rather than a boolean, so
# test truthiness; this also works if match returns True/False
if regex.match(u_infix, string):
    print("The inputs matched!")
else:
    print("The inputs did not match")
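# For reference, if `regex` here is the PyPI regex module, match returns a
# Match object or None, never a boolean, so comparing the result with == True
# or == False can never succeed:
import regex

print(regex.match('a.b', 'axb'))          # <regex.Match object; span=(0, 3), match='axb'>
print(regex.match('a.b', 'zzz'))          # None
print(regex.match('a.b', 'axb') == True)  # False: a Match object is not True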
def segment_inner(self, e): msgstr_mapping = {} def sanitize_key(key): return regex.sub(r'[\p{punct}\s]', '', key).casefold() msgstr_elements = e.cssselect('[data-msgstr]') for msgstr_element in msgstr_elements: msgstr_mapping[sanitize_key(msgstr_element.text_content())] = { 'value': msgstr_element.get('data-msgstr'), 'used': False } for msgstr_element in msgstr_elements: msgstr_element.drop_tag() variant_notes = {} var_elements = e.cssselect('.var') for var_element in var_elements: variant_notes[var_element.text_content()] = 'VAR: {} → {}'.format( var_element.text_content(), var_element.get('title')) for var_element in var_elements: var_element.drop_tag() html_string = lxml.html.tostring(e, encoding='unicode').strip() html_string = html_string.replace('\n', ' ').replace('\xa0', ' ').replace( '\xad', '') m = regex.match(r'<[^<]+>[ \n\t]*(.*)</\w+>', html_string, flags=regex.DOTALL) if not m: raise ValueError(html_string) html_string = m[1] m = regex.match(r'(?i)((?:<a[^<]*></a>[ \n\t]*)*)(.*)', html_string) if m[1]: self.add_token(TokenType.comment, m[1]) html_string = m[2] html_string = self.mangle(html_string) logger.info(html_string) pattern = r'(?<!\d+)([.;:!?—,。:;!…?—](?:\p{punct}+|[ \n\t]*MANG.[0-9]+GLE[\p{punct}\d]*MANG.[0-9]+GLE)*[\u200b\s]*|(?<!^)…[ \n\t]*(?:pe[ \n\t]*…[ \n\t]*)?[.;:!?—;:。,,。:;!…?—]*)(?:MANGR[0-9]+GLE)*' parts = regex.split(pattern, html_string) segments = [ ''.join(parts[i:i + 2]).strip() for i in range(0, len(parts), 2) ] for i, segment in list(enumerate(segments)): m = regex.match(r'(?r)[「「『]$', segment) if m: print(segments[i], segments[i + 1], segment[-1] + segments[i + 1]) segments[i + 1] = segment[-1] + segments[i + 1] segments[i] = segment[:-1] sentence_count = 0 for segment in segments: if not segment: continue segment = self.demangle(segment) lines = regex.split('(<br[^>]*>|(?:<a [^>]+></a>)*$)', segment) for i in range(0, len(lines), 2): line = lines[i].strip() if line: m = regex.match(r'^[ \n\t]*(</\w+>)(.*)', line, flags=regex.DOTALL) if m: if self.token_stream[ -1].type == TokenType.newline and self.token_stream[ -2].type == TokenType.text: self.token_stream[-2].value += m[1] line = m[2].strip() sentence_count += 1 ctxt = '{}:{}.{}'.format(self.uid, self.paragraph_count, sentence_count) msgstr = '' for var_text in list(variant_notes): if var_text in line: self.add_token(TokenType.comment_note, variant_notes.pop(var_text)) if line and not line.isspace(): try: key = sanitize_key( lxml.html.fromstring(line).text_content()) except Exception as e: globals().update(locals()) raise if key in msgstr_mapping: msgstr_mapping[key]['used'] = True msgstr = msgstr_mapping[key]['value'] self.add_token(TokenType.text, line, ctxt, msgstr) if i + 1 < len(lines): br = lines[i + 1].strip() if br: self.add_token(TokenType.comment, br) self.add_token(TokenType.newline) for key, obj in msgstr_mapping.items(): if obj['used'] == False: print('Failed to find use for {}: {}'.format( key, obj['value']))
def iterate_map_uri(self): """ Expand step templates for each map-reduce item. Items must match the map-reduce regex to be included, and are stored in the self._map list. If no map_uri is given, only one item "." is included in _map. Args: self: class instance. Returns: On success: True. On failure: False. """ def multiple_replace(string, rep_dict): """ Replace multiple string patterns simultaneously. Args: string: The string to be replaced. rep_dict: Dictionary containing key and values as patterns that should be replaced. Returns: On success: The string with all the patterns replaced. On failure: False. """ pattern = re.compile( "|".join([re.escape(k) for k in rep_dict.keys()]), re.M) return pattern.sub(lambda x: rep_dict[x.group(0)], string) # iterate map items if self._map_uris == []: # no mapping, run only one job self._map = [{ 'filename': 'root', 'chopped_uri': '', 'replace': {}, 'template': {}, 'status': 'PENDING', 'attempt': 0, 'run': [{}] }] else: # list uri contents and place into matched files file_list = self._get_map_uri_list() if file_list is False: msg = 'cannot get list of items from map uris: {}'.format( self._map_uris) Log.an().error(msg) return self._fatal(msg) if file_list == []: # this folder should never be empty msg = 'map uri contents cannot be empty: {}'.format( self._map_uris) Log.an().error(msg) return self._fatal(msg) for f in file_list: # check if file matches regex match = re.match(self._step['map']['regex'], f['filename']) if match: groups = list(match.groups()) replace = {} for i, group in enumerate(groups): replace[str('${' + str(i + 1) + '}')] = str(group) self._map.append({ 'filename': f['filename'], 'chopped_uri': f['chopped_uri'], 'replace': replace, 'template': {}, 'status': 'PENDING', 'attempt': 0, 'run': [{}] }) if not self._map: msg = ('map uri contents must include at least' ' one item matching regex: {}').format(self._map_uris) Log.an().error(msg) return self._fatal(msg) # iterate through items, expand templates for map_item in self._map: replace = map_item['replace'].copy() replace.update(self._replace) ##### replace map uri base with value corresponding to map item replace[self._step['map']['uri']] = map_item['chopped_uri'] for template_key in self._step['template']: if isinstance(self._step['template'][template_key], str): map_item['template'][template_key] = multiple_replace( self._step['template'][template_key], replace) else: map_item['template'][template_key]\ = self._step['template'][template_key] return True
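# Why the replacement in multiple_replace must happen in a single pass: the
# compiled alternation visits each position once, so one substitution's output
# can never be rewritten by a later pattern. Sketch with hypothetical template
# values:
import re

def multiple_replace(string, rep_dict):
    pattern = re.compile("|".join([re.escape(k) for k in rep_dict.keys()]), re.M)
    return pattern.sub(lambda x: rep_dict[x.group(0)], string)

template = 'align ${1} --out ${2}/result-${1}.bam'
replace = {'${1}': 'sample-A', '${2}': '/data/out'}
assert multiple_replace(template, replace) == \
    'align sample-A --out /data/out/result-sample-A.bam'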