def siman_smk_exctractor(smk_text):
    split = re.split(u'\s', smk_text)
    simanim = []
    for word in split:
        if not word or word == u'סימן' or word == u'סעיף':
            continue
        word = re.sub(u"[;.,']", u"", word)
        if re.search(u'-', word):
            borders = re.search(u"(.*?)-(.*)", word)
            start = getGematria(borders.group(1))
            end = getGematria(borders.group(2))
            for siman in range(start, end + 1):
                simanim.append(siman)
        if not is_hebrew_number(word):
            if not check_vav(word):
                # print smk_text, simanim
                return simanim
            else:
                simanim.append(check_vav(word))
        else:
            smk_siman = getGematria(word)
            simanim.append(smk_siman)
    # print smk_text, simanim
    return simanim
def grep_author_note(s):
    author = ''
    g = 3
    while True:
        s = re.sub(eres['clean_author'], '', s)
        tmp = re.search(eres['author'], s)
        if tmp:
            # All the following mess is needed because the regex engine is too slow
            # (it backtracks through every possible combination if the forbidden
            # part is written inside the `author' regex).
            # TODO handle situation '..., Surname A Paper Name ...' (where `A'
            # belongs to Paper Name...)
            if re.search(eres['author_forbidden_rest'], s[tmp.end(g):]):
                break
            ss = tmp.group(g)
            tmpp = re.search(eres['author_forbidden'], ss)
            if tmpp:
                maybe_empty = ss[:tmpp.start()]
                if author and maybe_empty:
                    author += AUTHOR_JOIN
                author += maybe_empty
                s = s[tmp.start(g) + tmpp.start(2):]
                break
            else:
                if author:
                    author += '; '
                author += ss.strip(' \t')
                # save only the rest (drop prefixes like `and' `with' etc.)
                s = s[tmp.end(g):]
        else:
            break
    return author, s
def clean_line(line):
    line = strip_nikkud(line)
    # note: '.' is escaped as \. in the replace dict; how can I check if it is right?
    replace_dict = {u'[.:\?]': u'', u'[”״]': u'"', u'[’׳]': u"'"}
    line = multiple_replace(line, replace_dict, using_regex=True)
    # line = re.sub(u'[:\?]', '', line)
    # line = re.sub(u'”', u'"', line)
    reg_parentheses = re.compile(u'\((.*?)\)')
    reg_brackets = re.compile(u'\[(.*?)\]')
    in_per = reg_parentheses.search(line)
    in_bra = reg_brackets.search(line)
    reg_ayyen_tur = re.compile(u'''ו?(עיין|עי'|ע"ש) בטור''')
    reg_lo_manu = re.compile(u'''(?P<a>(\u05d0\u05da )?\u05dc\u05d0 \u05de\u05e0(.*?))(\u05e1\u05de"?\u05d2|\u05e8\u05de\u05d1"?\u05dd|\u05d8\u05d5\u05e8|\n)''')
    line = re.sub(u'\[.*?אלפס.*?\]', u'', line)
    line = re.sub(u'טור ו?שו"ע', u'טוש"ע', line)
    f_ayyen = re.search(reg_ayyen_tur, line)
    f_lo_manu = re.search(reg_lo_manu, line)
    if f_ayyen:
        line = line[:f_ayyen.start()]
    if f_lo_manu:
        line = re.sub(f_lo_manu.group('a'), u"", line)
    if in_per:
        if in_bra:
            clean = re.sub(reg_brackets, ur'\1', line)  # brackets are always correct
            clean = re.sub(reg_parentheses, '', clean)
        else:
            clean = re.sub(reg_parentheses, ur'\1', line)
    elif in_bra:
        clean = re.sub(reg_brackets, ur'\1', line)  # brackets are always correct
    else:
        clean = line
    return clean
def grep_year_note(s, reverse=False, with_paren=True):
    s = clean_start_end(s, start=False)
    s = re.sub(eres['clean_author'], '', s)
    if reverse:
        postfix = 'r'
    else:
        postfix = ''
    if not with_paren:
        postfix += '_no_paren'
    tmp = re.search(eres['year4' + postfix], s)
    if tmp:
        year = tmp.group(1)
        note = s[:tmp.start()] + s[tmp.end():]
    else:
        tmp = re.search(eres['year2' + postfix], s)
        if tmp:
            year = '20' + tmp.group(1)
            note = s[:tmp.start()] + s[tmp.end():]
        else:
            year = ''
            note = s
    return year, note
def __check_error(self, str):
    if str[0] in string.ascii_lowercase:
        # At least one occurrence must be there for sure.
        m = regex.search(irrelevant_id_symbols, str)
        err_code = ERR_IRRELEVANT_SYMBOL
        err_pos = m.start()
    elif str[0] == 'R':
        if str[1] == 'T':
            if str[2] == '_':
                # At least one occurrence must be there for sure.
                m = regex.search(irrelevant_id_symbols, str[3:])
                err_pos = m.start()
            else:
                err_pos = 2
        else:
            err_pos = 1
        err_code = ERR_IRRELEVANT_SYMBOL
    elif str[0] == '"':
        if str.find('"', 1) == -1:
            err_code = ERR_QUOTES_NOT_CLOSED
            err_pos = 0
        else:
            # At least one occurrence must be there for sure.
            m = regex.search(irrelevant_str_symbols, str[1:])
            err_code = ERR_IRRELEVANT_SYMBOL
            err_pos = m.start() + 1
    else:
        err_code = ERR_IRRELEVANT_SYMBOL
        err_pos = 0
    return (err_code, err_pos)
def main():
    metaHtmlFilename = "Quaero/web/quaero_meta.html"
    standardHtmlFilename = "Quaero/web/quaero.html"
    latex2htmlLogFilename = "Quaero/doc/manual/html.log"
    metaHtml = open(metaHtmlFilename, 'r')
    standardHtml = open(standardHtmlFilename, 'w')
    classified = 0
    for line1 in metaHtml.readlines():
        keys = re.search("helpwindow_MANUAL\((.*)\)\.html", line1)
        if(not keys==None):
            key = keys.group(1)
            key = regsub.gsub("\+", "\\+", key)
            latex2htmlLog = open(latex2htmlLogFilename, 'r')
            foundNodeNumber = 0
            for line2 in latex2htmlLog.readlines():
                nodeNumber = re.search('"'+key+'" for node([0-9]*)\.html', line2)
                if(not nodeNumber==None):
                    line1 = regsub.gsub("helpwindow_MANUAL("+key+").html",
                                        "manual/manual/node"+nodeNumber.group(1)+".html", line1)
                    foundNodeNumber = 1
            if(foundNodeNumber==0):
                print 'Key "'+key+'" not found.'
            latex2htmlLog.close()
        if regex.search("BeginClassified", line1) >= 0:
            classified = 1
        if regex.search("EndClassified", line1) >= 0:
            classified = 0
        if(classified==0):
            standardHtml.write(line1)
        if regex.search("</html>", line1) >= 0:
            break
    metaHtml.close()
    standardHtml.close()
def parse_implied_depth(self, element):
    ja_depth_pattern = ur"\[(\d)\]$"
    ja_sections_pattern = ur"\[(.*)\]$"
    title_str = element.get('text').strip()
    depth_match = re.search(ja_depth_pattern, title_str)
    if depth_match:
        depth = int(depth_match.group(1))
        placeholder_sections = ['Volume', 'Chapter', 'Section', 'Paragraph']
        element.set('text', re.sub(ja_depth_pattern, "", title_str))
        return {'section_names': placeholder_sections[(-1 * depth):],
                'address_types': ['Integer'] * depth}
    sections_match = re.search(ja_sections_pattern, title_str)
    if sections_match:
        sections = [s.strip() for s in sections_match.group(1).split(",")]
        element.set('text', re.sub(ja_sections_pattern, "", title_str))
        section_names = []
        address_types = []
        for s in sections:
            tpl = s.split(":")
            section_names.append(tpl[0])
            address_types.append(tpl[1] if len(tpl) > 1 else 'Integer')
        return {'section_names': section_names, 'address_types': address_types}
    else:
        return None
def get_sham_ref_with_node(st, node, lang='he'):
    """
    Used when you know the node that a sham ref belongs to, in order to parse
    the ref according to the address types of that node.
    :param st: string to search for a sham ref
    :param node: node that we believe this sham ref belongs to
    :param lang:
    :return:
    """
    title_sham = u'שם'
    title_reg = CitationFinder.get_ultimate_title_regex(title_sham, node, lang, compiled=True)
    if node.full_title() in [u'I Samuel', u'II Samuel', u'I Kings', u'II Kings', u'I Chronicles', u'II Chronicles']:
        volume = re.search(u'שם (א|ב)\s', st)
        m = re.search(title_reg, st)
        if volume:
            st1 = re.sub(u'(א|ב)\s', u'', st, count=1, pos=m.start())
            m1 = re.search(title_reg, st1)
            if m1 and m1.groupdict()['a0'] and m1.groupdict()['a1']:
                node = CitationFinder.node_volumes(node, volume.group(1))
                return CitationFinder.parse_sham_match(m1, lang, node)
        if m:
            # there should be one and only one match
            return CitationFinder.parse_sham_match(m, lang, node)
    else:
        title_reg = CitationFinder.get_ultimate_title_regex(title_sham, node, lang, compiled=True)
        m = re.search(title_reg, st)
        if m:
            return CitationFinder.parse_sham_match(m, lang, node)
    raise InputError
def strip_derived_rvs(rvs):
    '''Convenience fn: remove PyMC3-generated RVs from a list'''
    ret_rvs = []
    for rv in rvs:
        if not (re.search('_log', rv.name) or re.search('_interval', rv.name)):
            ret_rvs.append(rv)
    return ret_rvs
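# A minimal usage sketch for strip_derived_rvs. FakeRV is a stand-in for real
# PyMC3 random variables, which only need a .name attribute here; the names
# below are hypothetical.
import re
from collections import namedtuple

FakeRV = namedtuple('FakeRV', 'name')
rvs = [FakeRV('beta'), FakeRV('sigma_log__'), FakeRV('sigma_interval__')]
# The transformed '_log'/'_interval' RVs are dropped, leaving only 'beta'.
print([rv.name for rv in strip_derived_rvs(rvs)])   # ['beta']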
def plot_phonemes(path): phoneme_embeddings = dict() for line in codecs.open(path,"r"): line = line.split(",") key= line[0][1:-1] emb = line[1:] emb[-1] = emb[-1][:-1] emb = np.array([float(e) for e in emb]) phoneme_embeddings[key] = emb phoneme_embeddings = DataFrame(phoneme_embeddings,columns=phoneme_embeddings.keys()) print(phoneme_embeddings.columns) m = TSNE() phoneme_embeddings_tsne = m.fit_transform(phoneme_embeddings.transpose()) print(len(phoneme_embeddings_tsne)) for p,emb in zip(phoneme_embeddings.columns, phoneme_embeddings_tsne): c = "black" if regex.search("^[aeiou3E][*]?$", p): c = "red" plt.annotate(p,(emb[0],emb[1]),color=c) if regex.search("^.*w~$", p): c = "blue" plt.annotate(p,(emb[0],emb[1]),color=c) if regex.search("^.*y~$", p): c = "yellow" plt.annotate(p,(emb[0],emb[1]),color=c) if regex.search("^.*h~$", p): c = "brown" plt.annotate(p,(emb[0],emb[1]),color=c) if regex.search("^.*\"$", p): c = "green" plt.annotate(p,(emb[0],emb[1]),color=c)
def parseaddr(address):
    # This is probably not perfect
    address = string.strip(address)
    # Case 1: part of the address is in <xx@xx> form.
    pos = regex.search('<.*>', address)
    if pos >= 0:
        name = address[:pos]
        address = address[pos:]
        length = regex.match('<.*>', address)
        name = name + address[length:]
        address = address[:length]
    else:
        # Case 2: part of the address is in (comment) form
        pos = regex.search('(.*)', address)
        if pos >= 0:
            name = address[pos:]
            address = address[:pos]
            length = regex.match('(.*)', name)
            address = address + name[length:]
            name = name[:length]
        else:
            # Case 3: neither. Only an address
            name = ''
    name = string.strip(name)
    address = string.strip(address)
    if address and address[0] == '<' and address[-1] == '>':
        address = address[1:-1]
    if name and name[0] == '(' and name[-1] == ')':
        name = name[1:-1]
    return name, address
def derive_new_rasag_halakhah_links(sources, generated_msg='rsg_sfm_linker'): """ This function returns links between the rasag and the halakhah links of the sources. the sources param are the "middleman" rasag-sources-halakhah and this function creates the links rasag-halakhah :param sources: a list of texts or categories on Sefaria :param generated_msg: a string to put on the link for the generated_by message :return: links rasag-halakhah """ new_links = [] source_links = get_links(sources) for link in source_links: rsg, otherref = link['refs'] if re.search("Sefer Hamitzvot", link['refs'][0]) else [link['refs'][1], link['refs'][0]] ls_otherref = LinkSet(Ref(otherref)).filter('Halakhah') # print otherref c = set([l.refs[0] for l in ls_otherref] + [l.refs[1] for l in ls_otherref]) # cluster_refs = [Ref(r) for r in c] # create_link_cluster(cluster_refs, 30044, "Sifrei Mitzvot", # attrs={"generated_by": "rsg_sfm_linker"}, exception_pairs=[("Tur", "Shulchan Arukh")]) c = c.difference({otherref, rsg}) for ref_string in list(c): if re.search("Sefer Hamitzvot", ref_string) or not re.search("Sefer Hamitzvot", rsg): continue link=({ "refs":[ rsg, ref_string ], "type": "Sifrei Mitzvot", "auto": True, "generated_by": generated_msg }) # print link['refs'] new_links.append(link) print len(new_links) return new_links
def parse_movie(search_name):
    """
    Parse a movie name into name/year.

    :param search_name: release name
    :return: (name, year)
    """
    result = regex.search('^(?P<name>.*)[\.\-_\( ](?P<year>19\d{2}|20\d{2})', search_name, regex.I)
    if result:
        result = result.groupdict()
        if 'year' not in result:
            result = regex.search(
                '^(?P<name>.*)[\.\-_ ](?:dvdrip|bdrip|brrip|bluray|hdtv|divx|xvid|proper|repack|real\.proper|sub\.?fix|sub\.?pack|ac3d|unrated|1080i|1080p|720p|810p)',
                search_name, regex.I)
            if result:
                result = result.groupdict()
        if 'name' in result:
            name = regex.sub('\(.*?\)|\.|_', ' ', result['name'])
            if 'year' in result:
                year = result['year']
            else:
                year = ''
            return {'name': name, 'year': year}
    return None
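# Usage sketch for parse_movie with a made-up release name; requires the
# third-party `regex` module that the function itself uses.
import regex

print(parse_movie('The.Example.Movie.2012.1080p.BluRay.x264'))
# -> {'name': 'The Example Movie', 'year': '2012'}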
def tabbed_output(columns, output_file, match_re_column_list=None, legacy_handling=None, match_any=False): if legacy_handling: match_re_column_list = [(match_re_column_list, legacy_handling)] try: if isinstance(match_re_column_list[0], str): match_re_column_list = [match_re_column_list] except TypeError: pass if isinstance(columns, dict): if not isinstance(columns, OrderedDict): raise TypeError("dictionary argument of 'tabbed_output()' must be an OrderedDict") if match_re_column_list: match_re_column_list = [(rx, columns[match_column]) for rx, match_column in match_re_column_list] columns = columns.values() lengths = [len(column) for column in columns] if len(set(lengths)) == 1: for i in range(lengths[0]): for column in columns: if isinstance(column[i], str): column[i] = [column[i]] column[i] = "‣".join(column[i]) line = "\t".join([clean_str(column[i]) for column in columns]) if match_re_column_list: if match_any: if any([re.search(rx, match_column[i]) for rx, match_column in match_re_column_list]): print(line, file=output_file) else: if all([re.search(rx, match_column[i]) for rx, match_column in match_re_column_list]): print(line, file=output_file) else: print(line, file=output_file) else: raise IndexError("first argument of 'tabbed_output()' must be an OrderedDict or list of identical-length lists")
def plot_traces_pymc(trcs, varnames=None): ''' Convenience fn: plot traces with overlaid means and values Handle nested traces for hierarchical models ''' nrows = len(trcs.varnames) if varnames is not None: nrows = len(varnames) ax = pm.traceplot(trcs, varnames=varnames, figsize=(12, nrows*1.4), lines={k: v['mean'] for k, v in pm.df_summary(trcs,varnames=varnames).iterrows()}, combined=True) # don't label the nested traces (a bit clumsy this: consider tidying) dfmns = pm.df_summary(trcs, varnames=varnames)['mean'].reset_index() dfmns.rename(columns={'index':'featval'}, inplace=True) dfmns = dfmns.loc[dfmns['featval'].apply(lambda x: re.search('__[1-9]{1,}', x) is None)] dfmns['draw'] = dfmns['featval'].apply(lambda x: re.search('__0{1}$', x) is None) dfmns['pos'] = np.arange(dfmns.shape[0]) dfmns.set_index('pos', inplace=True) for i, r in dfmns.iterrows(): if r['draw']: ax[i,0].annotate('{:.2f}'.format(r['mean']), xy=(r['mean'],0) ,xycoords='data', xytext=(5,10) ,textcoords='offset points', rotation=90 ,va='bottom', fontsize='large', color='#AA0022')
def scrape_wiki(): url = u"https://he.wikipedia.org/wiki/%D7%9E%D7%A0%D7%99%D7%99%D7%9F_%D7%94%D7%9E%D7%A6%D7%95%D7%95%D7%AA_%D7%A2%D7%9C_%D7%A4%D7%99_%D7%A1%D7%A4%D7%A8_%D7%94%D7%97%D7%99%D7%A0%D7%95%D7%9A" page = requests.get(url) soup_body = BeautifulSoup(page.text, "lxml") tables = soup_body.select(".mw-parser-output > table") pairs = [] links = [] for table in tables: table_tr = table.select("tr") for col in table_tr: pairs.append((col.contents[1].text.strip(), re.sub(u'</?td>', u'', col.contents[-1].text).strip())) for pair in pairs: if re.search(u'ספר|מספר', pair[0]): continue neg_pos = u"Negative Mitzvot" if re.search(u"לאו", pair[1]) else u'Positive Mitzvot' rambam = getGematria(re.sub(u'עשה|לאו', u'', pair[1]).strip()) chinukh = getGematria(pair[0]) print chinukh, rambam chinukh_simanlen = len(Ref(u'Sefer HaChinukh.{}'.format(chinukh)).all_segment_refs()) print neg_pos link = ({"refs": [ u'Sefer HaChinukh.{}.{}-{}'.format(chinukh, 1, chinukh_simanlen), u'Mishneh Torah, {}.{}'.format(neg_pos, rambam) ], "type": "Sifrei Mitzvot", "auto": True, "generated_by": "chinukh_rambam_sfm_linker" # _sfm_linker what is this parametor intended to be? }) print link['refs'] links.append(link) return links
def lcc_range(string):
    """
    Takes a string, returns a tuple of two LCCallNumber objects,
    the start and end of the range.
    """
    string = string.encode("ascii", "replace")
    string = string.replace("(", "")
    string = string.replace(")", "")
    if string.endswith("A-Z"):
        # TMI in the schedules when they're alphabetical.
        # I don't care.
        string = string.replace("A-Z", "")
    if "-" not in string:
        # A range of self length.
        return (LCCallNumber(string), LCCallNumber(string))
    parts = string.split("-")
    if re.search(r"^\d", parts[1]):
        header = re.sub("^([A-Z]+).*", r"\1", parts[0])
    elif re.search(r"^\.", parts[1]):
        header = re.sub(r"^([A-Z]+\d)+\..*", r"\1", parts[0])
    elif re.search(r"^[A-Z]", parts[1]):
        header = re.sub(r"^([A-Z]+\d)+\..[A-Z]*", r"\1.", parts[0])
    else:
        header = " "
    parts[1] = header + parts[1]
    return (LCCallNumber(parts[0]), LCCallNumber(parts[1]))
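# Usage sketch for lcc_range, assuming the LCCallNumber class from the
# surrounding module and Python 2 string semantics (the encode/replace calls
# assume str, not bytes). "QA76.75-76.765" is a hypothetical LCC range string.
start, end = lcc_range(u"QA76.75-76.765")
# -> roughly LCCallNumber("QA76.75"), LCCallNumber("QA76.765"); the truncated
#    right-hand side is re-prefixed with the "QA" header before construction.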
def _correct_splitlines_in_headers(markers, lines):
    """
    Corrects markers by removing splitlines deemed to be inside header blocks.
    """
    updated_markers = ""
    i = 0
    in_header_block = False

    for m in markers:
        # Only set the in_header_block flag when we hit an 's' and the line is a header.
        if m == 's':
            if not in_header_block:
                if bool(re.search(RE_HEADER, lines[i])):
                    in_header_block = True
            else:
                if QUOT_PATTERN.match(lines[i]):
                    m = 'm'
                else:
                    m = 't'

        # If the line is not a header line, set in_header_block false.
        if not bool(re.search(RE_HEADER, lines[i])):
            in_header_block = False

        # Add the marker to the new updated markers string.
        updated_markers += m
        i += 1

    return updated_markers
def tabularize(source, output_file, match_re_lv_list=None, lv_list=None,
               tagged=False, tag_types=set(), inc_df=False):
    if not lv_list:
        lv_list = sorted(set(flatten([mn.lv_list() for mn in source])))
    try:
        if isinstance(match_re_lv_list[0], str):
            match_re_lv_list = [match_re_lv_list]
    except TypeError:
        pass
    try:
        l = tqdm(source)
    except NameError:
        # tqdm is not available; iterate without a progress bar
        l = source
    for mn in l:
        line = tab_line(mn, lv_list, tagged, tag_types)
        if match_re_lv_list:
            match = False
            for rx, match_lv in match_re_lv_list:
                for ex in [dn.ex for dn in mn.dn_list if dn.lv == match_lv]:
                    if re.search(rx, clean_str(str(ex))):
                        match = True
                if inc_df:
                    for df in [df for df in mn.df_list if df.lv == match_lv]:
                        if re.search(rx, clean_str(str(df))):
                            match = True
            if match and clean_str(line):
                print(line, file=output_file)
        elif clean_str(line):
            print(line, file=output_file)
        else:
            pass
def tokenize(text):
    # This line changes tabs into spaces
    text = re.sub(r"\t", " ", text)
    # put blanks around characters that are unambiguous separators
    text = re.sub(always_sep, r" \g<0> ", text)
    # if a word is a separator in the beginning of a token, separate it here
    text = re.sub("^" + begin_sep, r"\g<0> ", text)
    text = re.sub(" " + begin_sep, r"\g<0> ", text)
    text = re.sub("(" + not_letter + ")(" + begin_sep + ")", r"\1 \2", text)
    # idem for final separators
    text = re.sub(end_sep + r"\s", r" \g<0>", text)
    text = re.sub(end_sep + "(" + not_letter + ")", r"\1 \2", text)
    # the end separator is already between parentheses and is stored in $1

    # This line divides the input line and assigns it to elements of an array
    all_words = text.split()
    words = []
    # We examine all the elements
    for word in all_words:
        # if it contains a letter followed by a period,
        if re.search(letter + r"\.", word):
            # we see if it is an abbreviation:
            # if it is explicitly found in the abbreviation list
            if word not in abbr:
                # or matches the regular expression below, we keep the period attached (possible acronyms).
                # Note: the \p{...} classes assume `re` is really the third-party `regex`
                # module (or an equivalent import); the standard `re` does not support them.
                if not re.search(r"^(\p{L}\.(\p{L}\.)+|\p{Lu}[bcdfghj-np-tvxz]+\.)$", word):
                    # if not, a space is inserted before the period
                    word = re.sub(r"\.$", r" .", word)
        # Change all spaces to new lines
        word = re.sub(r"[ \t]+", r"\n", word)
        # Append the current word
        words.append(word)
    return words
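# Side note on the acronym test above: a quick, isolated check of that pattern.
# The \p{...} classes only work with the third-party `regex` module, so it is
# used here directly; the inputs are hypothetical.
import regex

acronym = r"^(\p{L}\.(\p{L}\.)+|\p{Lu}[bcdfghj-np-tvxz]+\.)$"
print(bool(regex.search(acronym, "U.S.")))   # True  -> final period stays attached
print(bool(regex.search(acronym, "etc.")))   # False -> period is split off unless "etc." is in abbr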
def simple_rule(rule, targets):
    """Is this rule a simple rule?  A simple rule rewrites a single hostname,
    perhaps with an optional leading www\., to itself or to itself plus www.,
    at the top level with no other effects."""
    rule_from = rule.attrib["from"]
    rule_to = rule.attrib["to"]
    # Simple rule with no capture
    if regex.match(r"^\^http://[-A-Za-z0-9.\\]+/$", rule_from):
        applicable_host = unescape(regex.search(r"^\^http://([-A-Za-z0-9.\\]+)/$", rule_from).groups()[0])
        if regex.match(r"^https://%s/" % applicable_host, rule_to) or regex.match(
            r"^https://www\.%s/" % applicable_host, rule_to
        ):
            return True
        else:
            return False
    # Optional www
    if regex.match(r"^\^http://\(www\\\.\)\?[-A-Za-z0-9.\\]+/$", rule_from):
        applicable_host = unescape(regex.search(r"^\^http://\(www\\\.\)\?([-A-Za-z0-9.\\]+)/$", rule_from).groups()[0])
        if regex.match(r"^https://www\.%s/" % applicable_host, rule_to) or regex.match(
            r"^https://%s/" % applicable_host, rule_to
        ):
            return True
        else:
            return False
    return False
def parse_semag(self, str, mass): # split = re.split('\s', str.strip()) reg_book = re.compile(u'ו?(עשין|שם|לאוין)') split = re.split(reg_book, str.strip()) # str_list = filter(None, split) str_list = filter(None, [item.strip() for item in split]) resolveds = [] # it = iter(str_list) derabanan_flag = False book = None for i, word in enumerate(str_list): if derabanan_flag: derabanan_flag = False resolved = self._tracker.resolve(book, [1]) resolveds.append(resolved) continue elif re.search(reg_book, word): # book = word # if book == u'שם': # book = None # elif book == u'לאוין': # book = u'Sefer Mitzvot Gadol, Volume One' try: if word != u'שם': derabanan = filter(None, [item.strip() for item in re.split(u'(מד"ס|מ?דרבנן)',str_list[i+1].strip())]) except IndexError: # mass.ErrorFile.write('error smg, no place in book notation') mass.error_flag = 'error smg, no place in book notation' print 'error smg, no place in book notation' return if word == u'עשין' and len(derabanan) > 1 and (derabanan[0] != u"סימן"): book = re.search(u'[א-ה]',derabanan[1]) # print book.group(0) book = self._table[book.group(0)] derabanan_flag = True elif re.match(reg_book, word): book = self._table[word] else: mass.ErrorFile.write("error smg, don't recognize book name") print "error smg, don't recognize book name", book return else: mitzva = re.split('\s', word) for m in mitzva: # if m == u'סימן': # continue if m == u'שם': m = None elif getGematriaVav(m): m = getGematriaVav(m) else: m = None resolved = self._tracker.resolve(book, [m]) resolveds.append(resolved) if not resolveds: resolved = self._tracker.resolve(book, [None]) resolveds.append(resolved) # print resolveds return resolveds
def create_activation(data, labels, standard_cols, group_labels=[]): activation = database.Activation() for i, col in enumerate(data): # Cast to integer or float if appropriate # if regex.match('[-\d]+$', col): # col = int(col) # elif regex.match('[-\d\.]+$', col): # col = float(col) # Set standard attributes if applicable and do validation where appropriate. # Generally, validation will not prevent a bad value from making it into the # activation object, but it will flag any potential issues using the "problem" column. if standard_cols[i] is not None: sc = standard_cols[i] # Validate XYZ columns: Should only be integers (and possible trailing decimals). # If they're not, keep only leading numbers. The exception is that ScienceDirect # journals often follow the minus sign with a space (e.g., - 35), which we strip. if regex.match('[xyz]$', sc): m = regex.match('(-)\s+(\d+\.*\d*)$', col) if m: col = "%s%s" % (m.group(1), m.group(2)) if not regex.match('(-*\d+)\.*\d*$', col): logging.debug("Value %s in %s column is not valid" % (col, sc)) activation.problems.append("Value in %s column is not valid" % sc) # col = regex.search('(-*\d+)', col).group(1) return activation col = (float(col)) elif sc == 'region': if not regex.search('[a-zA-Z]', col): logging.debug("Value in region column is not a string") activation.problems.append("Value in region column is not a string") setattr(activation, sc, col) # Always include all columns in record activation.add_col(labels[i], col) # Handle columns with multiple coordinates (e.g., 45;12;-12). # Assume that any series of 3 numbers in a non-standard column # reflects coordinates. Will fail if there are leading numbers!!! # Also need to remove space between minus sign and numbers; some ScienceDirect # journals leave a gap. if not i in standard_cols: cs = '([\-\.\s]*\d{1,3})' m = regex.search('%s[,;\s]+%s[,;\s]+%s' % (cs, cs, cs), unicode(col).strip()) if m: x, y, z = [regex.sub('-\s+', '-', c) for c in [m.group(1), m.group(2), m.group(3)]] logger.info("Found multi-coordinate column: %s\n...and extracted: %s, %s, %s" % (col, x, y, z)) activation.set_coords(x, y, z) activation.groups = group_labels return activation
def parse_semag(self, str, mass): reg_book = re.compile(u'ו?ב?(עשין|שם|לאוין|לאין)') split = re.split(reg_book, str.strip()) str_list = filter(None, [item.strip() for item in split]) resolveds = [] derabanan_flag = False book = None reg_siman = u"סי'?|סימן" reg_vav = u'ו{}'.format(reg_siman) for i, word in enumerate(str_list): if derabanan_flag: derabanan_flag = False # resolved = self._tracker.resolve(book, [1]) resolved = resolveExceptin(self._tracker, book, [1]) resolveds.append(resolved) continue elif re.search(reg_book, word): try: if word != u'שם': derabanan = filter(None, [item.strip() for item in re.split(u'(מד"ס|מ?דרבנן)',str_list[i+1].strip())]) except IndexError: mass.write_shgia('error smg, no place in book notation') return if word == u'עשין' and len(derabanan) > 1: book = re.search(u'[א-ה]',derabanan[1]) book = self._table[book.group(0)] derabanan_flag = True elif re.match(reg_book, word): book = self._table[word] else: mass.write_shgia("error smg, don't recognize book name") return else: mitzva = re.split('\s', word) for m in mitzva: if re.search(reg_vav, m) and not book: # resolved = self._tracker.resolve(book, [None]) resolved = resolveExceptin(self._tracker, book, [None]) resolveds.append(resolved) if m == u'ו?שם': m = None elif re.search(reg_siman, m): continue elif getGematriaVav(m, mass): m = getGematriaVav(m, mass) else: m = None # resolved = self._tracker.resolve(book, [m]) resolved = resolveExceptin(self._tracker, book, [m]) resolveds.append(resolved) if not resolveds: # resolved = self._tracker.resolve(book, [None]) resolved = resolveExceptin(self._tracker, book, [None]) resolveds.append(resolved) if len([item for item in resolveds if not isinstance(item, Ref)]) > 0: mass.write_shgia(u'error from ibid in Ref or table none problem') return resolveds
def _sub_symbols(self, line):
    while re.search(r'::\w+::', line):
        s = re.search(r'::\w+::', line).group(0)
        if s in self.symbols:
            line = line.replace(s, self.symbols[s])
        else:
            raise RuleFileError('Undefined symbol: {}'.format(s))
    return line
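# Minimal usage sketch for _sub_symbols: SimpleNamespace stands in for the real
# rule-file parser object, which only needs a `symbols` dict here; the
# '::vowel::' symbol is hypothetical.
import re
from types import SimpleNamespace

parser = SimpleNamespace(symbols={'::vowel::': '[aeiou]'})
print(_sub_symbols(parser, 'x::vowel::y'))   # -> 'x[aeiou]y'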
def is_blacklisted(part, group_name, blacklists):
    for blacklist in blacklists:
        if regex.search(blacklist.group_name, group_name):
            # too spammy
            # log.debug('{0}: Checking blacklist {1}...'.format(group_name, blacklist['regex']))
            if regex.search(blacklist.regex, part[blacklist.field]):
                return True
    return False
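# Usage sketch for is_blacklisted with a stand-in blacklist row (the real rows
# come from the application's database); `part` is a dict keyed by the field
# the blacklist inspects. All values below are hypothetical.
import regex
from collections import namedtuple

Blacklist = namedtuple('Blacklist', 'group_name regex field')
blacklists = [Blacklist(r'^alt\.binaries\.', r'(?i)password', 'subject')]
part = {'subject': 'Re: PASSWORD inside'}
print(is_blacklisted(part, 'alt.binaries.movies', blacklists))   # True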
def use_type_get_index(self, st):
    address_regex = CitationFinder.get_ultimate_title_regex(title=u"שם", node=None, lang='he')
    # address_regex = CitationFinder.create_or_address_regexes(lang='he')
    address_type = None
    m = re.search(address_regex, st)
    for k, v in m.groupdict().items():
        if v and not re.search("Title|a0|a1", k):
            address_type = k
    return address_type
def seferHamitzvot_from_rasag_comm(rasagCsvName, with_orig = False): # ind_rasag_comm = library.get_index("Commentary on Sefer Hamitzvot of Rasag") segments = Ref('Commentary_on_Sefer_Hamitzvot_of_Rasag,_Positive_Commandments').all_segment_refs() segments.extend(Ref('Commentary_on_Sefer_Hamitzvot_of_Rasag,_Negative_Commandments').all_segment_refs()) segments.extend(Ref('Commentary_on_Sefer_Hamitzvot_of_Rasag,_Laws_of_the_Courts').all_segment_refs()) segments.extend(Ref('Commentary_on_Sefer_Hamitzvot_of_Rasag,_Communal_Laws').all_segment_refs()) cnt = {"Rasag":0, "Sefer HaMitzvot":0, "Semag":0, "Semak":0} dict_list = [] for seg in segments: # sfHmtzvot = re.search(u'(?:ספר המצו?ות|סה"מ).{1,4}(עשין|לאוין|עשה|לא תעשה).{0,20}', seg.text('he').text) sfHmtzvot = re.search(u'(?:ספר המצוות|סה"מ)\s{1,4}\((.*?)\)', seg.text('he').text) smg = re.search(u'סמ"ג \((.*?)\)', seg.text('he').text) smk = re.search(u'סמ"ק (\(.*?\))', seg.text('he').text) row_dict = {} row_orig = {} if sfHmtzvot: # row_dict["Rasag"] = re.search("(Sefer.*?\d*?):", seg.normal()).group(1) # row_orig["Rasag"] = re.search("(Sefer.*?\d*?):", seg.normal()).group(1) kind, simanim = rasag_exctractor(sfHmtzvot.group(1)) # row_dict["Sefer HaMitzvot"] = ['Sefer HaMitzvot, {}.{}'.format(kind, siman) for siman in simanim] if kind: row_dict["Sefer HaMitzvot"] = 'Sefer HaMitzvot, {}.{}'.format(kind, simanim[0]) else: print "no kind", sfHmtzvot.group(1) row_orig["Sefer HaMitzvot"] = sfHmtzvot.group() cnt["Sefer HaMitzvot"] += 1 if smg: # row_dict["Rasag"] = re.search("(Sefer.*?\d*?):", seg.normal()).group(1) kind, simanim = rasag_exctractor(smg.group(1)) # row_dict["Semag"] = ['Sefer Mitzvot Gadol, {}.{}'.format(kind, siman) for siman in simanim] if kind: row_dict["Semag"] = 'Sefer Mitzvot Gadol, {}.{}'.format(kind, simanim[0]) else: print "no kind", smg.group(1) row_orig["Semag"] = smg.group() cnt["Semag"] += 1 if smk: # row_dict["Rasag"] = re.search("(Sefer.*?\d*?):", seg.normal()).group(1) # simanim = siman_smk_exctractor(smk.group(1)) smki = re.search(u"ב?סי'\s+(.*?)(?:\s*\))", smk.group(1)) if smki: siman = getGematria(smki.group(1)) row_dict["Semak"] = "Sefer Mitzvot Katan.{}".format(siman) row_orig["Semak"] = smk.group() cnt["Semak"] += 1 else: print u'***siman***' + smk.group() if row_dict: cnt["Rasag"] += 1 row_dict["Rasag"] = re.search("(Sefer.*?\d*?):", seg.normal()).group(1) row_orig["Rasag"] = seg.normal() if with_orig: dict_list.append(row_orig) dict_list.append(row_dict) toCsv(rasagCsvName, ["Rasag", "Sefer HaMitzvot", "Semag", "Semak"], dict_list) print cnt
def evaluate(self, scope, locals, block=None):
    # self.data = re.sub('\A[\n\r\s]+','',self.data)
    blank_pattern = re.compile(r"""\A\s*\Z""", re.M)
    end_pattern = re.compile(r""";\s*\Z""", re.M)
    if re.search(blank_pattern, self.data) or re.search(end_pattern, self.data):
        return self.data
        # return "%s\n" % self.data if self.data != "" and not self.data.endswith('\n') else self.data
    else:
        return "%s;\n" % self.data
def isYelling(message):
    isYelling = False
    if re.search(u'[A-Z]', message):
        if not re.search(u'[a-z]', message):
            # NB: the \p{Ll} class requires the third-party `regex` module;
            # the standard `re` module does not support it.
            if re.compile(u'\p{Ll}', re.UNICODE).search(message):
                isYelling = False
            else:
                isYelling = True
    return isYelling
def parse_implicit_date(self, source: str, reference: datetime) -> DateTimeParseResult: trimmed_source = source.strip() result = DateTimeResolutionResult() # handle "on 12" match = regex.search(self.config.on_regex, self.config.date_token_prefix + trimmed_source) if match and match.start() == len( self.config.date_token_prefix) and len( match.group()) == len(trimmed_source): day = 0 month = reference.month year = reference.year day_str = match.group('day') day = self.config.day_of_month.get(day_str) result.timex = DateTimeFormatUtil.luis_date(-1, -1, day) try_str = DateTimeFormatUtil.luis_date(year, month, day) try_date = datetime.strptime(try_str, '%Y-%m-%d') future_date: datetime past_date: datetime if try_date: future_date = DateUtils.safe_create_from_min_value( year, month, day) past_date = DateUtils.safe_create_from_min_value( year, month, day) if future_date < reference: future_date += datedelta(months=1) if past_date >= reference: past_date += datedelta(months=-1) else: future_date = DateUtils.safe_create_from_min_value( year, month + 1, day) past_date = DateUtils.safe_create_from_min_value( year, month - 1, day) result.future_value = future_date result.past_value = past_date result.success = True return result # handle "today", "the day before yesterday" match = regex.match(self.config.special_day_regex, trimmed_source) if match and match.start() == 0 and len( match.group()) == len(trimmed_source): swift = self.config.get_swift_day(match.group()) value = reference + timedelta(days=swift) result.timex = DateTimeFormatUtil.luis_date_from_datetime(value) result.future_value = value result.past_value = value result.success = True return result # handle "next Sunday" match = regex.match(self.config.next_regex, trimmed_source) if match and match.start() == 0 and len( match.group()) == len(trimmed_source): weekday_str = match.group('weekday') value = DateUtils.next(reference, self.config.day_of_week.get(weekday_str)) result.timex = DateTimeFormatUtil.luis_date_from_datetime(value) result.future_value = value result.past_value = value result.success = True return result # handle "this Friday" match = regex.match(self.config.this_regex, trimmed_source) if match and match.start() == 0 and len( match.group()) == len(trimmed_source): weekday_str = match.group('weekday') value = DateUtils.this(reference, self.config.day_of_week.get(weekday_str)) result.timex = DateTimeFormatUtil.luis_date_from_datetime(value) result.future_value = value result.past_value = value result.success = True return result # handle "last Friday", "last mon" match = regex.match(self.config.last_regex, trimmed_source) if match and match.start() == 0 and len( match.group()) == len(trimmed_source): weekday_str = match.group('weekday') value = DateUtils.last(reference, self.config.day_of_week.get(weekday_str)) result.timex = DateTimeFormatUtil.luis_date_from_datetime(value) result.future_value = value result.past_value = value result.success = True return result # handle "Friday" match = regex.match(self.config.week_day_regex, trimmed_source) if match and match.start() == 0 and len( match.group()) == len(trimmed_source): weekday_str = match.group('weekday') weekday = self.config.day_of_week.get(weekday_str) value = DateUtils.this(reference, weekday) if weekday < int(DayOfWeek.Monday): weekday = int(DayOfWeek.Sunday) if weekday < reference.isoweekday(): value = DateUtils.next(reference, weekday) result.timex = 'XXXX-WXX-' + str(weekday) future_date = value past_date = value if future_date < reference: future_date += 
timedelta(weeks=1) if past_date >= reference: past_date -= timedelta(weeks=1) result.future_value = future_date result.past_value = past_date result.success = True return result # handle "for the 27th." match = regex.match(self.config.for_the_regex, trimmed_source) if match: day_str = match.group('DayOfMonth') er = ExtractResult.get_from_text(day_str) day = int(self.config.number_parser.parse(er).value) month = reference.month year = reference.year result.timex = DateTimeFormatUtil.luis_date(-1, -1, day) date = datetime(year, month, day) result.future_value = date result.past_value = date result.success = True return result # handling cases like 'Thursday the 21st', which both 'Thursday' and '21st' refer to a same date match = regex.match(self.config.week_day_and_day_of_month_regex, trimmed_source) if match: day_str = match.group('DayOfMonth') er = ExtractResult.get_from_text(day_str) day = int(self.config.number_parser.parse(er).value) month = reference.month year = reference.year # the validity of the phrase is guaranteed in the Date Extractor result.timex = DateTimeFormatUtil.luis_date(year, month, day) date = datetime(year, month, day) result.future_value = date result.past_value = date result.success = True return result return result
def OutputStudy(self, out_file, num_sections, has_next, main_words, extra_word_lists, body_dbm, uniq_words, out_children): def P(*args, end="\n"): esc_args = [] for arg in args[1:]: if isinstance(arg, str): arg = esc(arg) esc_args.append(arg) print(args[0].format(*esc_args), end=end, file=out_file) def PrintNavi(): P('<div class="navi">') P('<a href="index.xhtml">TOP</a>') check_url = "check-{:03d}.xhtml".format(num_sections) P('<a href="{}">CHECK</a>', check_url) if num_sections == 1: P('<span class="void">PREV</span>') else: prev_url = "study-{:03d}.xhtml".format(num_sections - 1) P('<a href="{}">PREV</a>', prev_url) if has_next: next_url = "study-{:03d}.xhtml".format(num_sections + 1) P('<a href="{}">NEXT</a>', next_url) else: P('<span class="void">NEXT</span>') P('</div>') P('<?xml version="1.0" encoding="UTF-8"?>') P('<!DOCTYPE html>') P('<html xmlns="http://www.w3.org/1999/xhtml">') P('<head>') P('<meta charset="UTF-8"/>') P('<meta name="viewport" content="width=device-width"/>') P('<title>{}: Chapter {} Study</title>', self.title, num_sections) P('<link rel="stylesheet" href="style.css"/>') P('</head>') P('<body>') P('<article>') PrintNavi() P('<h1><a href="">Chapter {} Study</a></h1>', num_sections) num_words = 0 for surface, aliases in main_words: entry = None data = body_dbm.GetStr(surface) if data: entries = json.loads(data) for word_entry in entries: if word_entry["word"] == surface: entry = word_entry break if not entry: P('<p>Warning: no data for {}</p>', surface) continue uniq_words[surface] = num_sections num_words += 1 word_id = ConvertWordToID(surface) P('<section id="{}" class="entry">', word_id) P('<div class="num">{:02d}</div>', num_words) P('<div class="head">') P('<a href="#{}" class="word">{}</a>', word_id, surface) pron = entry.get("pronunciation") if pron: P('<span class="pron">{}</span>', pron) P('</div>', surface) trans = entry.get("translation") if trans: P('<div class="trans">{}</div>', ", ".join(trans[:8])) first_label = None num_items = 0 poses = set() for item in entry["item"]: label = item["label"] pos = item["pos"] text = item["text"] if regex.search(r"^\[translation\]", text): continue if num_items >= 10: break if first_label and label != first_label: break first_label = label parts = [] for part in text.split("[-]"): part = part.strip() parts.append(part) if not parts: continue num_items += 1 main_text = CutTextByWidth(parts[0], 128) poses.add(pos) P('<div class="text">') P('<span class="attr">{}</span>', POSES.get(pos) or pos) P('<span>{}</span>', main_text) P('</div>') synonyms = [] examples = [] for part in parts[1:]: match = regex.search(r"\[synonym\]: (.*)", part) if match: synonyms.append(match.group(1).strip()) match = regex.search(r"\e.g.: (.*)", part) if match: examples.append(match.group(1).strip()) for text in synonyms: text = CutTextByWidth(text, 128) P('<div class="aux">') P('<span class="auxattr">≒</span>') P('<span>{}</span>', text) P('</div>') for text in examples[:1]: text = CutTextByWidth(text, 128) P('<div class="aux">') P('<span class="auxattr">・</span>') P('<span>{}</span>', text) P('</div>') parents = entry.get("parent") children = entry.get("child") sibling_alts = set((parents or []) + (children or [])) if children: for child in list(children): child_data = body_dbm.GetStr(child) if not child_data: continue child_entries = json.loads(child_data) for child_entry in child_entries: if child_entry["word"] != child: continue grand_children = child_entry.get("child") if grand_children: for grand_child in grand_children: if grand_child 
not in children: children.append(grand_child) phrases = entry.get("phrase") or [] for label, derivatives in (("語幹", parents), ("派生", children)): if not derivatives: continue for child in derivatives: if not regex.search(r"^[a-zA-Z]", child): continue if child in uniq_words: continue uniq_words[child] = num_sections child_trans = None child_poses = None child_data = body_dbm.GetStr(child) if child_data: child_entries = json.loads(child_data) child_prob = 0 for child_entry in child_entries: if child_entry["word"] != child: continue child_prob = float(child_entry.get("probability") or 0.0) us_hit = IsGbAlternative(child_entry, sibling_alts, body_dbm) if us_hit: continue child_trans = child_entry.get("translation") child_poses = self.GetEntryPOSList(child_entry) child_phrases = child_entry.get("phrase") or [] if child_phrases: phrases.extend(child_phrases) break if not child_trans: continue if self.child_min_prob > 0 and child_prob < self.child_min_prob: continue P('<div class="child">') P('<span class="attr">{}</span>', label) for child_pos in child_poses[:2]: P('<span class="attr subattr">{}</span>', POSES.get(child_pos) or child_pos) child_id = ConvertWordToID(child) P('<span id="{}" class="subword">{}</span>', child_id, child_id, child) P('<span class="childtrans">: {}</span>', ", ".join(child_trans[:4])) P('</div>') out_children[surface].append((child, child_trans)) if phrases: for phrase in phrases: if not phrase.get("i"): continue phrase_word = phrase.get("w") if not regex.search(r"^[a-zA-Z]", phrase_word): continue if phrase_word in uniq_words: continue uniq_words[phrase_word] = num_sections phrase_data = body_dbm.GetStr(phrase_word) if not phrase_data: continue phrase_entries = json.loads(phrase_data) phrase_trans = None phrase_poses = None phrase_prob = 0 for phrase_entry in phrase_entries: if phrase_entry["word"] != phrase_word: continue phrase_prob = float(phrase_entry.get("probability") or 0.0) phrase_trans = phrase_entry.get("translation") phrase_poses = self.GetEntryPOSList(phrase_entry) break if not phrase_trans: continue if self.child_min_prob > 0 and phrase_prob < self.child_min_prob: continue P('<div class="child">') P('<span class="attr">句</span>') for phrase_pos in phrase_poses[:2]: P('<span class="attr subattr">{}</span>', POSES.get(phrase_pos) or phrase_pos) phrase_id = ConvertWordToID(phrase_word) P('<span href="#{}" id="{}" class="subword">{}</span>', phrase_id, phrase_id, phrase_word) P('<span class="childtrans">: {}</span>', ", ".join(phrase_trans[:4])) P('</div>') infls = [] for name, label in INFLECTIONS: prefix = name[:name.find("_")] if prefix not in poses: continue value = entry.get(name) if not value: continue infls.append((label, value)) uniq_infls = set() for alias in aliases: if alias in uniq_infls: continue uniq_infls.add(alias) infls.append(("代替", alias)) alternatives = entry.get("alternative") if alternatives: for alt in alternatives: if alt in uniq_infls: continue uniq_infls.add(alt) infls.append(("代替", alt)) if infls: P('<div class="meta">') for label, value in infls: P('<span class="attr">{}</span>', label) P('<span class="metavalue">{}</span>', value) P('</div>') P('</section>') extra_words = [] for extra_word_list in extra_word_lists: num_extra_words = 0 for extra_word in extra_word_list: if num_extra_words >= self.num_extra_items: break if extra_word in uniq_words: continue extra_trans = [] extra_poses = [] extra_data = body_dbm.GetStr(extra_word) if extra_data: extra_entries = json.loads(extra_data) for extra_entry in extra_entries: if 
extra_entry["word"] != extra_word: continue extra_trans.extend(extra_entry.get("translation") or []) extra_poses.extend(self.GetEntryPOSList(extra_entry)) if not extra_trans: continue extra_trans = extra_trans[:5] has_good_tran = False for extra_tran in extra_trans: if regex.search(r"[\p{Han}\p{Hiragana}\p{Katakana}]", extra_tran): has_good_tran = True if not has_good_tran: continue extra_words.append((extra_word, extra_trans, extra_poses)) uniq_words[extra_word] = num_sections num_extra_words += 1 if extra_words: P('<section class="entry">') P('<div class="num">Bonus Words</div>') for extra_word, extra_trans, extra_poses in extra_words: P('<div class="extra">') word_id = ConvertWordToID(extra_word) P('<span id="{}" class="subword">{}</span> :', word_id, word_id, extra_word) for extra_pos in extra_poses[:2]: P('<span class="attr subattr">{}</span>', POSES.get(extra_pos) or extra_pos) P('<span class="childtrans">{}</span>', ", ".join(extra_trans)) P('</div>') P('</section>') PrintNavi() P('</article>') P('</body>') P('</html>')
def regex_find(string, pattern):
    match = regex.search(pattern, string, flags=re.IGNORECASE)
    return match is not None
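# Quick usage check with hypothetical inputs; the function mixes the
# third-party `regex` module with the `re.IGNORECASE` flag, which regex accepts.
print(regex_find("Hello World", r"world"))   # True (case-insensitive)
print(regex_find("Hello World", r"\d+"))     # False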
def is_hebrew(s):
    if regex.search(u"\p{Hebrew}", s):
        return True
    return False
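# Usage sketch; \p{Hebrew} needs the third-party `regex` module, which the
# function already relies on.
print(is_hebrew(u"שלום"))    # True
print(is_hebrew(u"hello"))   # False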
try: title = driver.find_element_by_xpath(f'//*[@id="body"]/div/div[2]/ul/li[{i}]/div/a/div[2]').text print(title) except: num+=1 i=1 continue for word in wordlist: if word in title: print("ERROR") ERROR = True break # check number if (re.search(r'\d', title)): print("NUMBER ERROR") ERROR = True # typ = driver.find_element_by_xpath(f'//*[@id="main-content"]/div/main/article[{i}]/div/div/p/a').text # if 'DINNER' not in typ: # print('SIDE ERROR') # ERROR = True if ERROR: i+=1 continue
def parse_implicit_date(self, source: str, reference: datetime) -> DateTimeParseResult: trimmed_source = source.strip() result = DateTimeResolutionResult() # handle "on 12" match = regex.search(self.special_date_regex, trimmed_source) if match and len(match.group()) == len(trimmed_source): day = 0 month = reference.month year = reference.year year_str = RegExpUtility.get_group(match, 'thisyear') month_str = RegExpUtility.get_group(match, 'thismonth') day_str = RegExpUtility.get_group(match, 'day') day = self.config.day_of_month.get(day_str, -1) has_year = year_str.strip() != '' has_month = month_str.strip() != '' if has_month: if regex.search(self.token_next_regex, month_str): month += 1 if month == Constants.MaxMonth + 1: month = Constants.MinMonth year += 1 elif regex.search(self.token_last_regex, month_str): month -= 1 if month == Constants.MinMonth - 1: month = Constants.MaxMonth year -= 1 if has_year: if regex.search(self.token_next_regex, year_str): year += 1 elif regex.search(self.token_last_regex, year_str): year -= 1 result.timex = DateTimeFormatUtil.luis_date( year if has_year else -1, month if has_month else -1, day) future_date: datetime past_date: datetime if day > self.get_month_max_day(year, month): futureMonth = month + 1 pastMonth = month - 1 futureYear = year pastYear = year if futureMonth == Constants.MaxMonth + 1: futureMonth = Constants.MinMonth futureYear = year + 1 if pastMonth == Constants.MinMonth - 1: pastMonth = Constants.MaxMonth pastYear = year - 1 isFutureValid = DateUtils.is_valid_date( futureYear, futureMonth, day) isPastValid = DateUtils.is_valid_date(pastYear, pastMonth, day) if isFutureValid and isPastValid: future_date = DateUtils.safe_create_from_min_value( futureYear, futureMonth, day) past_date = DateUtils.safe_create_from_min_value( pastYear, pastMonth, day) elif isFutureValid and not isPastValid: future_date = past_date = DateUtils.safe_create_from_min_value( futureYear, futureMonth, day) elif not isFutureValid and not isPastValid: future_date = past_date = DateUtils.safe_create_from_min_value( pastYear, pastMonth, day) else: future_date = past_date = DateUtils.safe_create_from_min_value( year, month, day) else: future_date = DateUtils.safe_create_from_min_value( year, month, day) past_date = DateUtils.safe_create_from_min_value( year, month, day) if not has_month: if future_date < reference: if self.is_valid_date(year, month + 1, day): future_date += datedelta(months=1) if past_date >= reference: if self.is_valid_date(year, month - 1, day): past_date += datedelta(months=-1) elif self.is_non_leap_year_Feb_29th( year, month - 1, day): past_date += datedelta(months=-2) elif has_month and not has_year: if future_date < reference: if self.is_valid_date(year + 1, month, day): future_date += datedelta(years=1) if past_date >= reference: if self.is_valid_date(year - 1, month, day): past_date += datedelta(years=-1) result.future_value = future_date result.past_value = past_date result.success = True return result # handle "today", "the day before yesterday" match = regex.match(self.config.special_day_regex, trimmed_source) if match and match.start() == 0 and len( match.group()) == len(trimmed_source): swift = self.config.get_swift_day(match.group()) value = reference + timedelta(days=swift) result.timex = DateTimeFormatUtil.luis_date_from_datetime(value) result.future_value = value result.past_value = value result.success = True return result # handle "this Friday" match = regex.match(self.config.this_regex, trimmed_source) if match and match.start() == 0 
and len( match.group()) == len(trimmed_source): weekday_str = RegExpUtility.get_group(match, 'weekday') value = DateUtils.this(reference, self.config.day_of_week.get(weekday_str)) result.timex = DateTimeFormatUtil.luis_date_from_datetime(value) result.future_value = value result.past_value = value result.success = True return result # handle "next Sunday" match = regex.match(self.config.next_regex, trimmed_source) if match and match.start() == 0 and len( match.group()) == len(trimmed_source): weekday_str = RegExpUtility.get_group(match, 'weekday') value = DateUtils.next(reference, self.config.day_of_week.get(weekday_str)) result.timex = DateTimeFormatUtil.luis_date_from_datetime(value) result.future_value = value result.past_value = value result.success = True return result # handle "last Friday", "last mon" match = regex.match(self.config.last_regex, trimmed_source) if match and match.start() == 0 and len( match.group()) == len(trimmed_source): weekday_str = RegExpUtility.get_group(match, 'weekday') value = DateUtils.last(reference, self.config.day_of_week.get(weekday_str)) result.timex = DateTimeFormatUtil.luis_date_from_datetime(value) result.future_value = value result.past_value = value result.success = True return result # handle "Friday" match = regex.match(self.config.week_day_regex, trimmed_source) if match and match.start() == 0 and len( match.group()) == len(trimmed_source): weekday_str = RegExpUtility.get_group(match, 'weekday') weekday = self.config.day_of_week.get(weekday_str) value = DateUtils.this(reference, weekday) if weekday == 0: weekday = 7 if weekday < reference.isoweekday(): value = DateUtils.next(reference, weekday) result.timex = 'XXXX-WXX-' + str(weekday) future_date = value past_date = value if future_date < reference: future_date += timedelta(weeks=1) if past_date >= reference: past_date -= timedelta(weeks=1) result.future_value = future_date result.past_value = past_date result.success = True return result return result
def should_skip_from_merge(self, source: ExtractResult) -> bool:
    return regex.search(self.config.from_to_regex, source.text)
def alignSequences(targetsite_sequence, window_sequence, max_score=7): window_sequence = window_sequence.upper() query_regex_standard, query_regex_gap = regexFromSequence( targetsite_sequence, errors=max_score) # Try both strands alignments_mm, alignments_bulge = list(), list() alignments_mm.append(('+', 'standard', regex.search(query_regex_standard, window_sequence, regex.BESTMATCH))) alignments_mm.append(('-', 'standard', regex.search(query_regex_standard, reverseComplement(window_sequence), regex.BESTMATCH))) alignments_bulge.append(('+', 'gapped', regex.search(query_regex_gap, window_sequence, regex.BESTMATCH))) alignments_bulge.append(('-', 'gapped', regex.search(query_regex_gap, reverseComplement(window_sequence), regex.BESTMATCH))) lowest_distance_score, lowest_mismatch = 100, max_score + 1 chosen_alignment_b, chosen_alignment_m, chosen_alignment_strand_b, chosen_alignment_strand_m = None, None, '', '' # Use regex to find the best match allowing only for mismatches for aln_m in alignments_mm: strand_m, alignment_type_m, match_m = aln_m if match_m != None: mismatches, insertions, deletions = match_m.fuzzy_counts if mismatches < lowest_mismatch: chosen_alignment_m = match_m chosen_alignment_strand_m = strand_m lowest_mismatch = mismatches # Use regex to find the best match allowing for gaps, so that its edit distance is strictly lower than the # total number of mismatches of the sequence founded (if any) allowing only for mismatches. for aln_b in alignments_bulge: strand_b, alignment_type_b, match_b = aln_b if match_b != None: substitutions, insertions, deletions = match_b.fuzzy_counts if insertions or deletions: distance_score = substitutions + (insertions + deletions) * 3 edistance = substitutions + insertions + deletions if distance_score < lowest_distance_score and edistance < lowest_mismatch: chosen_alignment_b = match_b chosen_alignment_strand_b = strand_b lowest_distance_score = distance_score if chosen_alignment_m: offtarget_sequence_no_bulge = chosen_alignment_m.group() mismatches = chosen_alignment_m.fuzzy_counts[0] start_no_bulge = chosen_alignment_m.start() end_no_bulge = chosen_alignment_m.end() else: offtarget_sequence_no_bulge, mismatches, start_no_bulge, end_no_bulge, chosen_alignment_strand_m = '', '', '', '', '' bulged_offtarget_sequence, score, length, substitutions, insertions, deletions, bulged_start, bulged_end, realigned_target = \ '', '', '', '', '', '', '', '', 'none' if chosen_alignment_b: realigned_target, bulged_offtarget_sequence = realignedSequences( targetsite_sequence, chosen_alignment_b, max_score) if bulged_offtarget_sequence: length = len(chosen_alignment_b.group()) substitutions, insertions, deletions = chosen_alignment_b.fuzzy_counts score = substitutions + (insertions + deletions) * 3 bulged_start = chosen_alignment_b.start() bulged_end = chosen_alignment_b.end() else: chosen_alignment_strand_b = '' return [ offtarget_sequence_no_bulge, mismatches, len(offtarget_sequence_no_bulge), chosen_alignment_strand_m, start_no_bulge, end_no_bulge, realigned_target, bulged_offtarget_sequence, length, score, substitutions, insertions, deletions, chosen_alignment_strand_b, bulged_start, bulged_end ]
def find_date_separator(format):
    m = re.search(r'(?:(?:%[dbBmaA])(\W))+', format)
    if m:
        return m.group(1)
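# Usage sketch: the capture group returns the last separator that follows a
# %d/%b/%B/%m/%a/%A directive, or None when no such pair is present.
print(find_date_separator('%d/%m/%Y'))   # '/'
print(find_date_separator('%d.%m.%Y'))   # '.'
print(find_date_separator('%Y'))         # None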
def is_group_price(self, string):
    string = string.strip()
    rex = r'\dF\d\d?\d?'
    if regex.search(rex, string):
        return True
    return False
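# Usage sketch with hypothetical price strings ("3F20" reads as "3 for 20").
# `self` is unused by the method body, so None is passed here purely for
# illustration.
import regex

print(is_group_price(None, '3F20'))      # True
print(is_group_price(None, '12.50'))     # False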
def Run(self): start_time = time.time() logger.info("Process started: input_path={}, output_path={}".format( self.input_path, self.output_path)) mem_index = tkrzw.DBM() mem_index.Open("", True, dbm="BabyDBM").OrDie() input_dbm = tkrzw.DBM() input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie() it = input_dbm.MakeIterator() it.First() num_entries = 0 num_translations = 0 while True: record = it.GetStr() if not record: break key, serialized = record entry = json.loads(serialized) for word_entry in entry: prob = max(float(word_entry.get("probability") or "0"), 0.0000001) aoa = min(float(word_entry.get("aoa") or "20"), 20.0) score = prob * ((30 - aoa) / 10) word_trans = word_entry.get("translation") or [] dup_word_trans = word_trans for word_tran in word_trans: match = regex.search( r"([\p{Han}\p{Katakana}ー]{2,})(する|すること|される|されること|をする)$", word_tran) if match: short_word_tran = word_tran[:-len(match.group(2))] if short_word_tran: dup_word_trans.append(short_word_tran) short_word_tran = self.tokenizer.CutJaWordNounParticle( word_tran) if short_word_tran != word_tran: dup_word_trans.append(short_word_tran) match = regex.search( r"([\p{Han}\p{Katakana}ー]{2,})(的|的な|的に)$", word_tran) if match: short_word_tran = word_tran[:-len(match.group(2))] if short_word_tran: dup_word_trans.append(short_word_tran) match = regex.search( r"([\p{Han}]{2,})(が|の|を|に|へ|と|より|から|で|や|な|なる|たる)$", word_tran) if match: short_word_tran = word_tran[:-len(match.group(2))] if short_word_tran: dup_word_trans.append(short_word_tran) uniq_trans = set() for tran in dup_word_trans: norm_tran = tkrzw_dict.NormalizeWord(tran) if norm_tran in uniq_trans: continue uniq_trans.add(norm_tran) pair = "{}\t{:.8f}".format(key, score) score *= 0.98 mem_index.Append(norm_tran, pair, "\t").OrDie() num_translations += len(uniq_trans) num_entries += 1 if num_entries % 10000 == 0: logger.info("Reading: entries={}, translations={}".format( num_entries, num_translations)) it.Next() input_dbm.Close().OrDie() logger.info("Reading done: entries={}, translations={}".format( num_entries, num_translations)) output_dbm = tkrzw.DBM() num_buckets = mem_index.Count() * 2 output_dbm.Open(self.output_path, True, dbm="HashDBM", truncate=True, align_pow=0, num_buckets=num_buckets).OrDie() tran_prob_dbm = None if self.tran_prob_path: tran_prob_dbm = tkrzw.DBM() tran_prob_dbm.Open(self.tran_prob_path, False, dbm="HashDBM").OrDie() it = mem_index.MakeIterator() it.First() num_records = 0 while True: record = it.GetStr() if not record: break key, value = record scored_trans = [] uniq_words = set() fields = value.split("\t") for i in range(0, len(fields), 2): word = fields[i] score = float(fields[i + 1]) if word in uniq_words: continue uniq_words.add(word) if tran_prob_dbm: prob = self.GetTranProb(tran_prob_dbm, word, key) score = (score * max(prob, 0.000001))**0.5 scored_trans.append((word, score)) scored_trans = sorted(scored_trans, key=lambda x: x[1], reverse=True) value = "\t".join([x[0] for x in scored_trans]) output_dbm.Set(key, value).OrDie() num_records += 1 if num_records % 10000 == 0: logger.info("Writing: records={}".format(num_records)) it.Next() if tran_prob_dbm: tran_prob_dbm.Close().OrDie() output_dbm.Close().OrDie() logger.info("Writing done: records={}".format(num_records)) mem_index.Close().OrDie() logger.info("Process done: elapsed_time={:.2f}s".format(time.time() - start_time))
for item in p: # print(type(item), item.name, item.get('class'), item.get('id')) # print(item.text) if item.name == 'dl': maths = item.findAll('math') for math in maths: text += math.get('alttext')[15:-1] + '\n' else: math_elements = item.findAll(class_='mwe-math-element') if math_elements: matches = regex.findall(' \\n\\n[−\\/\\(\\)\\|\\=\\+\\-A-Z0-9a-z\s\\p{Greek}]+{\\\displaystyle .+}', item.text) temp2 = '' from_counter = 0 for match in matches: temp = item.text[from_counter:item.text.find(match)] temp += ' ' + regex.search('{\\\displaystyle .+}', match)[0][15:-1] from_counter = (item.text.find(match) + len(match) + 2) temp2 += temp temp2 += item.text[from_counter:] text += temp2 else: text += item.text print('Downloading images...') images = html_soup.findAll('img', {'src':regex.compile('[.jpg|.png]')}) image_url_list = [] for image in images: if image.get('src').startswith('/static'): images.remove(image) elif 'https:' not in image.get('src'): image_url_list.append('https:' + image.get('src'))
def rok_zgonu(x):
    # "rok zgonu" = year of death.
    # NB: the variable-length lookbehind below is only accepted by the
    # third-party `regex` module; the standard `re` module requires
    # fixed-width lookbehinds.
    try:
        return int(
            re.search('(?<=\- ca |\-ca |\-ok\. |\-|po )(\d+)', x).group(0))
    except (TypeError, AttributeError):
        return None
# The function match() only tries to match
# at the beginning of a string
if regex.match(pattern1, reg_string1):
    print("You found a match!")
else:
    print("Bummer. No match.")

if regex.match(pattern1, reg_string2):
    print("You found a match!")
else:
    print("Bummer. No match.")

#=================================
# Second example, using search()
#=================================
if regex.search(pattern1, reg_string1):
    print("You found a match!")
else:
    print("Bummer. No match.")

if regex.search(pattern1, reg_string2):
    print("You found a match!")
else:
    print("Bummer. No match.")

#===========================================
# Comparing functions match() and search()
#===========================================
matches1 = regex.match(pattern1, reg_string1)
matches2 = regex.match(pattern1, reg_string2)
matches3 = regex.search(pattern1, reg_string1)
#! /usr/bin/env python

# Fix Python source files to avoid using
#     def method(self, (arg1, ..., argn)):
# instead of the more rational
#     def method(self, arg1, ..., argn):
#
# Command line arguments are files or directories to be processed.
# Directories are searched recursively for files whose name looks
# like a Python module.
# Symbolic links are always ignored (except as explicit directory
# arguments). Of course, the original file is kept as a back-up
# (with a "~" attached to its name).
# It complains about binaries (files containing null bytes)
# and about files that are ostensibly not Python files: i.e. the first
# line starts with '#!' and does not contain the string 'python'.
#
# Changes made are reported to stdout in a diff-like format.
#
# Undoubtedly you can do this using find and sed or perl, but this is
# a nice example of Python code that recurses down a directory tree
# and uses regular expressions. Also note several subtleties like
# preserving the file's mode and avoiding even writing a temp file
# when no changes are needed for a file.
#
# NB: by changing only the function fixline() you can turn this
# into a program for a different change to Python programs...

import sys
import regex
import os
from stat import *
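# Since the header above says only fixline() needs changing to repurpose the
# script, here is a minimal sketch of what such a function could look like.
# This is an assumption for illustration: it uses the modern `re` module rather
# than the legacy `regex` module imported above, and the rewrite rule (dropping
# the tuple-parameter parentheses) is just the example named in the header.
import re

_funcdef = re.compile(r"^(\s*def\s+\w+\s*\(\s*self\s*,\s*)\((.*)\)(\s*\)\s*:)")

def fixline(line):
    m = _funcdef.match(line)
    if m:
        # "def method(self, (arg1, arg2)):" -> "def method(self, arg1, arg2):"
        return m.group(1) + m.group(2) + m.group(3) + line[m.end():]
    return line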
def _frac_like_number_parse(self, ext_result: ExtractResult) -> ParseResult: result = ParseResult() result.start = ext_result.start result.length = ext_result.length result.text = ext_result.text result.type = ext_result.type result_text = ext_result.text.lower() if regex.search(self.config.fraction_marker_token, result_text): over_index = result_text.find(self.config.fraction_marker_token) small_part = result_text[0:over_index].strip() big_part = result_text[over_index + len(self.config.fraction_marker_token ):len(result_text)].strip() small_value = self._get_digital_value( small_part, 1) if self._is_digit( small_part[0]) else self.__get_int_value( self.__get_matches(small_part)) big_value = self._get_digital_value(big_part, 1) if self._is_digit( big_part[0]) else self.__get_int_value( self.__get_matches(big_part)) result.value = small_value / big_value else: words = list(filter(lambda x: x, result_text.split(' '))) frac_words = self.config.normalize_token_set(words, result) # Split fraction with integer split_index = len(frac_words) - 1 current_value = self.config.resolve_composite_number( frac_words[split_index]) round_value = 1 for split_index in range(len(frac_words) - 2, -1, -1): if (frac_words[split_index] in self.config.written_fraction_separator_texts or frac_words[split_index] in self.config.written_integer_separator_texts): continue previous_value = current_value current_value = self.config.resolve_composite_number( frac_words[split_index]) sm_hundreds = 100 # previous: hundred # current: one if ((previous_value >= sm_hundreds and previous_value > current_value) or (previous_value < sm_hundreds and self.__is_composable(current_value, previous_value))): if (previous_value < sm_hundreds and current_value >= round_value): round_value = current_value elif (previous_value < sm_hundreds and current_value < round_value): split_index += 1 break # current is the first word if split_index == 0: # scan, skip the first word split_index = 1 while split_index <= len(frac_words) - 2: # e.g. 
one hundred thousand # frac[i+1] % 100 and frac[i] % 100 = 0 if (self.config.resolve_composite_number( frac_words[split_index]) >= sm_hundreds and not frac_words[split_index + 1] in self .config.written_fraction_separator_texts and self.config.resolve_composite_number( frac_words[split_index + 1]) < sm_hundreds): split_index += 1 break split_index += 1 break continue split_index += 1 break frac_part = [] for i in range(split_index, len(frac_words)): if frac_words[i].find('-') > -1: split = frac_words[i].split('-') frac_part.append(split[0]) frac_part.append('-') frac_part.append(split[1]) else: frac_part.append(frac_words[i]) frac_words = frac_words[:split_index] # denomi = denominator denomi_value = self.__get_int_value(frac_part) # Split mixed number with fraction numer_value = 0 int_value = 0 mixed_index = len(frac_words) for i in range(len(frac_words) - 1, -1, -1): if (i < len(frac_words) - 1 and frac_words[i] in self.config.written_fraction_separator_texts): numer_str = ' '.join(frac_words[i + 1:len(frac_words)]) numer_value = self.__get_int_value( self.__get_matches(numer_str)) mixed_index = i + 1 break int_str = ' '.join(frac_words[0:mixed_index]) int_value = self.__get_int_value(self.__get_matches(int_str)) # Find mixed number if (mixed_index != len(frac_words) and numer_value < denomi_value): # int_value + numer_value / denomi_value result.value = int_value + numer_value / denomi_value else: # (int_value + numer_value) / denomi_value result.value = (int_value + numer_value) / denomi_value # Convert to float for fixed float point vs. exponential notation consistency /w C#/TS/JS result.value = float(result.value) return result
def AppendTranslations(self, wnjpn_trans, feedback_trans, aux_trans, subaux_trans, tran_thes, synset_index, tran_index): start_time = time.time() logger.info("Appending translations: input_path={}, output_path={}".format( self.input_path, self.output_path)) input_dbm = tkrzw.DBM() input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie() phrase_prob_dbm = None if self.phrase_prob_path: phrase_prob_dbm = tkrzw.DBM() phrase_prob_dbm.Open(self.phrase_prob_path, False, dbm="HashDBM").OrDie() rev_prob_dbm = None if self.rev_prob_path: rev_prob_dbm = tkrzw.DBM() rev_prob_dbm.Open(self.rev_prob_path, False, dbm="HashDBM").OrDie() tokenizer = tkrzw_tokenizer.Tokenizer() tran_prob_dbm =None if self.tran_prob_path: tran_prob_dbm = tkrzw.DBM() tran_prob_dbm.Open(self.tran_prob_path, False, dbm="HashDBM").OrDie() output_dbm = tkrzw.DBM() num_buckets = input_dbm.Count() * 2 output_dbm.Open( self.output_path, True, dbm="HashDBM", truncate=True, align_pow=0, num_buckets=num_buckets).OrDie() num_words = 0 num_orig_trans = 0 num_match_trans = 0 num_voted_trans = 0 num_borrowed_trans = 0 num_items = 0 num_items_bare = 0 num_items_rescued = 0 it = input_dbm.MakeIterator() it.First() while True: record = it.GetStr() if not record: break key, serialized = record entry = json.loads(serialized) items = entry["item"] spell_ratios = {} for item in items: word = item["word"] phrase_prob = float(item.get("prob") or 0.0) spell_ratios[word] = phrase_prob + 0.00000001 sum_prob = 0.0 for word, prob in spell_ratios.items(): sum_prob += prob for word, prob in list(spell_ratios.items()): spell_ratios[word] = prob / sum_prob all_tran_probs = tran_index.get(word) or {} for item in items: attrs = ["translation", "synonym", "antonym", "hypernym", "hyponym", "similar", "derivative"] for attr in attrs: rel_words = item.get(attr) if rel_words: rel_words = self.SortRelatedWords( rel_words, all_tran_probs, tokenizer, phrase_prob_dbm, tran_prob_dbm, synset_index, tran_index) item[attr] = rel_words for item in items: word = item["word"] pos = item["pos"] synset = item["synset"] links = item.get("link") or {} phrase_prob = float(item.get("prob") or 0.0) spell_ratio = spell_ratios[word] synonyms = item.get("synonym") or [] hypernyms = item.get("hypernym") or [] hyponyms = item.get("hyponym") or [] similars = item.get("similar") or [] derivatives = item.get("derivative") or [] synonym_ids = links.get("synonym") or [] hypernym_ids = links.get("hypernym") or [] hyponym_ids = links.get("hyponym") or [] similar_ids = links.get("similar") or [] derivative_ids = links.get("derivative") or [] item_tran_pairs = wnjpn_trans.get(synset) or [] item_aux_trans = list(aux_trans.get(word) or []) ext_item_aux_trans = list(item_aux_trans) ext_item_aux_trans.extend(subaux_trans.get(word) or []) self.NormalizeTranslationList(tokenizer, pos, item_aux_trans) self.NormalizeTranslationList(tokenizer, pos, ext_item_aux_trans) scored_item_trans = collections.defaultdict(float) hand_trans = set() for tran, src in item_tran_pairs: if src == "mono": hit = False for item_aux_tran in ext_item_aux_trans: dist = tkrzw.Utility.EditDistanceLev(tran, item_aux_tran) dist_ratio = dist / max(len(tran), len(item_aux_tran)) if dist < 0.3: hit = True if not hit: continue tran = tokenizer.NormalizeJaWordForPos(pos, tran) scored_item_trans[tran] = 1.0 if src == "hand": hand_trans.add(tran) if feedback_trans: item_fb_trans = feedback_trans.get(word + ":" + synset) or [] if item_fb_trans: for tran in item_fb_trans: tran = tokenizer.NormalizeJaWordForPos(pos, tran) if tran not 
in scored_item_trans: scored_item_trans[tran] = 0.9 for tran, score in list(scored_item_trans.items()): if score != 1.0: continue cmp_words = tran_thes.get(tran) if cmp_words: for cmp_word in cmp_words: if cmp_word not in scored_item_trans: scored_item_trans[cmp_word] = 0.5 num_items += 1 bare = not scored_item_trans if bare: num_items_bare += 1 num_orig_trans += len(scored_item_trans) syno_tran_counts = collections.defaultdict(int) hyper_tran_counts = collections.defaultdict(int) hypo_tran_counts = collections.defaultdict(int) similar_tran_counts = collections.defaultdict(int) derivative_tran_counts = collections.defaultdict(int) aux_trans_set = set(ext_item_aux_trans) checked_words = set() checked_ids = set([synset]) adopted_rel_trans = set() voted_rel_words = set() voted_rel_records = set() for rel_words, rel_ids, tran_counts in ( (synonyms, synonym_ids, syno_tran_counts), (hypernyms, hypernym_ids, hyper_tran_counts), (hyponyms, hyponym_ids, hypo_tran_counts), (similars, similar_ids, similar_tran_counts), (derivatives, derivative_ids, derivative_tran_counts)): for rel_word in rel_words: is_similar = self.AreSimilarWords(rel_word, word) rel_phrase_prob = 0.0 if phrase_prob_dbm: rel_phrase_prob = self.GetPhraseProb(phrase_prob_dbm, tokenizer, "en", rel_word) mean_prob = (phrase_prob * rel_phrase_prob) ** 0.5 rel_aux_trans = [] if rel_word not in checked_words: checked_words.add(rel_word) tmp_aux_trans = aux_trans.get(rel_word) if tmp_aux_trans: rel_aux_trans.extend(tmp_aux_trans) for rel_id in synset_index[rel_word]: if rel_id not in rel_ids: continue if rel_id not in checked_ids: checked_ids.add(rel_id) tmp_aux_trans = wnjpn_trans.get(rel_id) if tmp_aux_trans: tmp_aux_trans = [x[0] for x in tmp_aux_trans] rel_aux_trans.extend(tmp_aux_trans) if rel_aux_trans: self.NormalizeTranslationList(tokenizer, pos, rel_aux_trans) if not is_similar and mean_prob < 0.0005: for item_aux_tran in ext_item_aux_trans: if regex.fullmatch(r"[\p{Hiragana}]{,3}", item_aux_tran): continue if item_aux_tran in rel_aux_trans: if self.IsValidPosTran(tokenizer, pos, item_aux_tran): adopted_rel_trans.add(item_aux_tran) if mean_prob < 0.005: voted_top = rel_word for voted_rel_word in voted_rel_words: if self.AreSimilarWords(rel_word, voted_rel_word): voted_top = voted_rel_word break voted_rel_words.add(rel_word) for rel_aux_tran in set(rel_aux_trans): voted_record = (voted_top, rel_aux_tran) if voted_record in voted_rel_records: continue voted_rel_records.add(voted_record) tran_counts[rel_aux_tran] += 1 for rel_tran in adopted_rel_trans: scored_item_trans[rel_tran] = max(0.8, scored_item_trans[rel_tran] + 0.25) num_match_trans += 1 if bare: for deri_tran, count in derivative_tran_counts.items(): syno_tran_counts[deri_tran] = syno_tran_counts[deri_tran] + count derivative_tran_counts.clear() adopted_syno_trans = set() for syno_tran, count in syno_tran_counts.items(): if regex.fullmatch(r"[\p{Hiragana}]{,3}", syno_tran): continue if syno_tran in hyper_tran_counts: count += 1 if syno_tran in hypo_tran_counts: count += 1 if syno_tran in similar_tran_counts: count += 1 if syno_tran in derivative_tran_counts: count += 1 if syno_tran in aux_trans_set: count += 1 if count >= 3 and self.IsValidPosTran(tokenizer, pos, syno_tran): adopted_syno_trans.add(syno_tran) for syno_tran in adopted_syno_trans: scored_item_trans[syno_tran] = max(0.8, scored_item_trans[syno_tran] + 0.25) num_voted_trans += 1 if item_aux_trans: aux_scores = {} for syno_tran, count in syno_tran_counts.items(): if count < math.ceil(len(synonyms) * 2 / 3): 
continue if len(syno_tran) < 2: continue if not regex.search(r"\p{Han}[\p{Han}\p{Hiragana}]", syno_tran): continue for aux_tran in item_aux_trans: if aux_tran.find(syno_tran) >= 0 and self.IsValidPosTran(tokenizer, pos, aux_tran): weight = 0.25 if aux_tran == syno_tran else 0.2 aux_scores[aux_tran] = max(aux_scores.get(aux_tran) or 0.0, weight) for hyper_tran, count in hyper_tran_counts.items(): if count < math.ceil(len(hypernyms) * 2 / 3): continue if len(hyper_tran) < 2: continue if not regex.search(r"\p{Han}[\p{Han}\p{Hiragana}]", hyper_tran): continue for aux_tran in item_aux_trans: if aux_tran.find(hyper_tran) >= 0 and self.IsValidPosTran(tokenizer, pos, aux_tran): weight = 0.25 if aux_tran == hyper_tran else 0.2 aux_scores[aux_tran] = max(aux_scores.get(aux_tran) or 0.0, weight) for aux_tran, score in aux_scores.items(): scored_item_trans[aux_tran] = scored_item_trans[aux_tran] + score num_borrowed_trans += 1 item_score = 0.0 if scored_item_trans: scored_item_trans = scored_item_trans.items() if bare: num_items_rescued += 1 if rev_prob_dbm or tran_prob_dbm: sorted_item_trans, item_score, tran_scores = (self.SortWordsByScore( word, pos, scored_item_trans, hand_trans, rev_prob_dbm, tokenizer, tran_prob_dbm)) else: scored_item_trans = sorted(scored_item_trans, key=lambda x: x[1], reverse=True) sorted_item_trans = [x[0] for x in scored_item_trans] final_item_trans = [] uniq_item_trans = set() for tran in sorted_item_trans: tran = regex.sub(r"^を.*", "", tran) tran = regex.sub(r"・", "", tran) if not tran or tran in uniq_item_trans: continue uniq_item_trans.add(tran) final_item_trans.append(tran) item["translation"] = final_item_trans[:MAX_TRANSLATIONS_PER_WORD] if tran_scores: tran_score_map = {} for tran, tran_score in tran_scores[:MAX_TRANSLATIONS_PER_WORD]: tran = regex.sub(r"^を.*", "", tran) tran = regex.sub(r"・", "", tran) if tran and tran not in tran_score_map: tran_score_map[tran] = "{:.6f}".format(tran_score).replace("0.", ".") item["translation_score"] = tran_score_map item_score += spell_ratio * 0.5 item["score"] = "{:.8f}".format(item_score).replace("0.", ".") if "link" in item: del item["link"] if rev_prob_dbm: entry["item"] = sorted( items, key=lambda item: float(item.get("score") or 0.0), reverse=True) serialized = json.dumps(entry, separators=(",", ":"), ensure_ascii=False) output_dbm.Set(key, serialized).OrDie() num_words += 1 if num_words % 1000 == 0: logger.info("Saving words: words={}".format(num_words)) it.Next() output_dbm.Close().OrDie() if tran_prob_dbm: tran_prob_dbm.Close().OrDie() if rev_prob_dbm: rev_prob_dbm.Close().OrDie() if phrase_prob_dbm: phrase_prob_dbm.Close().OrDie() input_dbm.Close().OrDie() logger.info( "Aappending translations done: words={}, elapsed_time={:.2f}s".format( num_words, time.time() - start_time)) logger.info(("Stats: orig={}, match={}, voted={}, borrowed={}" + ", items={}, bare={}, rescued={}").format( num_orig_trans, num_match_trans, num_voted_trans, num_borrowed_trans, num_items, num_items_bare, num_items_rescued))
def check_if_part_of_date(self, text_body):
    # Matches year fragments such as "2021", "2021-", "2021-05" or "2021-05-17".
    rex = r'^(\d{4}|\d{2})(-|-\d{2})?(-|-\d{2})?$'
    if regex.search(rex, text_body):
        return True
    return False
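# A quick standalone check of what that pattern accepts (the stdlib re module
# handles this pattern the same way as the third-party regex module used above):
import re

rex = r'^(\d{4}|\d{2})(-|-\d{2})?(-|-\d{2})?$'
for sample in ["2021", "2021-", "2021-05", "2021-05-17", "May 2021"]:
    print(sample, bool(re.search(rex, sample)))
# 2021 True, 2021- True, 2021-05 True, 2021-05-17 True, May 2021 False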
import re
import pytesseract

def main(img):
    osd = pytesseract.image_to_osd(img)
    # "Rotate:" is a number of degrees, but "Script:" is a script name such as
    # "Latin", so it is matched with \w+ rather than \d+.
    angle = re.search(r"(?<=Rotate: )\d+", osd).group(0)
    script = re.search(r"(?<=Script: )\w+", osd).group(0)
    print("angle: ", angle)
    print("script: ", script)
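# For reference, the same parsing against a hard-coded OSD blob, so the regexes
# can be tried without Tesseract or an image; the blob is an assumed example of
# the image_to_osd() text format.
import re

osd = (
    "Page number: 0\n"
    "Orientation in degrees: 90\n"
    "Rotate: 270\n"
    "Orientation confidence: 2.57\n"
    "Script: Latin\n"
    "Script confidence: 1.25\n"
)

print(re.search(r"(?<=Rotate: )\d+", osd).group(0))   # 270
print(re.search(r"(?<=Script: )\w+", osd).group(0))   # Latin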
def _eval(self, context: RuleContext) -> Optional[LintResult]: """Do not use special characters in object names.""" # Config type hints self.quoted_identifiers_policy: str self.unquoted_identifiers_policy: str self.allow_space_in_identifier: bool self.additional_allowed_characters: str self.ignore_words: str self.ignore_words_regex: str # Confirm it's a single identifier. assert context.segment.is_type("naked_identifier", "quoted_identifier") # Get the ignore_words_list configuration. try: ignore_words_list = self.ignore_words_list except AttributeError: # First-time only, read the settings from configuration. This is # very slow. ignore_words_list = self._init_ignore_words_list() # Assume unquoted (we'll update if quoted) policy = self.unquoted_identifiers_policy identifier = context.segment.raw # Skip if in ignore list if ignore_words_list and identifier.lower() in ignore_words_list: return None # Skip if matches ignore regex if self.ignore_words_regex and regex.search( self.ignore_words_regex, identifier ): return LintResult(memory=context.memory) # Do some extra processing for quoted identifiers. if context.segment.is_type("quoted_identifier"): # Update the default policy to quoted policy = self.quoted_identifiers_policy # Strip the quotes first identifier = context.segment.raw[1:-1] # Skip if in ignore list - repeat check now we've strip the quotes if ignore_words_list and identifier.lower() in ignore_words_list: return None # Skip if matches ignore regex - repeat check now we've strip the quotes if self.ignore_words_regex and regex.search( self.ignore_words_regex, identifier ): return LintResult(memory=context.memory) # BigQuery table references are quoted in back ticks so allow dots # # It also allows a star at the end of table_references for wildcards # (https://cloud.google.com/bigquery/docs/querying-wildcard-tables) # # Strip both out before testing the identifier if ( context.dialect.name in ["bigquery"] and context.parent_stack and context.parent_stack[-1].is_type("table_reference") ): if identifier[-1] == "*": identifier = identifier[:-1] identifier = identifier.replace(".", "") # SparkSQL file references for direct file query # are quoted in back ticks to allow for identfiers common # in file paths and regex patterns for path globbing # https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select-file.html # # Path Glob Filters (done inline for SQL direct file query) # https://spark.apache.org/docs/latest/sql-data-sources-generic-options.html#path-global-filter # if context.dialect.name in ["sparksql"] and context.parent_stack: # SparkSQL file references for direct file query # are quoted in back ticks to allow for identfiers common # in file paths and regex patterns for path globbing # https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select-file.html # # Path Glob Filters (done inline for SQL direct file query) # https://spark.apache.org/docs/latest/sql-data-sources-generic-options.html#path-global-filter # if context.parent_stack[-1].is_type("file_reference"): return None # SparkSQL properties keys used for setting table and runtime # configurations denote namespace using dots, so these are # removed before testing L057 to not trigger false positives # Runtime configurations: # https://spark.apache.org/docs/latest/configuration.html#application-properties # Example configurations for table: # https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#configuration # if context.parent_stack[-1].is_type("property_name_identifier"): identifier = 
identifier.replace(".", "") # Strip spaces if allowed (note a separate config as only valid for quoted # identifiers) if self.allow_space_in_identifier: identifier = identifier.replace(" ", "") # We always allow underscores so strip them out identifier = identifier.replace("_", "") # redshift allows a # at the beginning of temporary table names if ( context.dialect.name == "redshift" and identifier[0] == "#" and context.parent_stack and context.parent_stack[-1].is_type("table_reference") ): identifier = identifier[1:] # Set the identified minus the allowed characters additional_allowed_characters = self._get_additional_allowed_characters( context.dialect.name ) if additional_allowed_characters: identifier = identifier.translate( str.maketrans("", "", additional_allowed_characters) ) # Finally test if the remaining identifier is only made up of alphanumerics if identifiers_policy_applicable(policy, context.parent_stack) and not ( identifier.isalnum() ): return LintResult(anchor=context.segment) return None
def PrepareSection(self, clusters, num_sections, has_next, uniq_words, body_dbm, phrase_dbm, vetted_words, out_words, out_children): section_main_words = [] section_extra_word_lists = [] for cluster in clusters: main_words = [] skipped_words = [] dedup_words = [] aliases = collections.defaultdict(set) num_words = len(cluster[0]) local_uniq_words = set() for word in cluster[0]: data = body_dbm.GetStr(word) if not data: continue is_dup = word in uniq_words or word in local_uniq_words entries = json.loads(data) for entry in entries: if entry["word"] != word: continue if is_dup: is_verb = False for item in entry["item"]: if item["label"] == "wn" and item["pos"] == "verb": is_verb = True has_verb_inflection = entry.get("verb_present_participle") if not is_verb or not has_verb_inflection: continue parents = entry.get("parent") children = entry.get("child") for derivatives in (parents, children): if derivatives: for derivative in derivatives: local_uniq_words.add(derivative) trans = entry.get("translation") if not trans: continue has_good_tran = False for tran in trans[:6]: if regex.search(r"[\p{Han}\p{Hiragana}]", tran): has_good_tran = True break if not has_good_tran: skipped_words.append(word) continue count_synonyms = collections.defaultdict(int) num_items = 0 for item in entry["item"]: if item["label"] == "wn": num_items += 1 for part in item["text"].split("[-]"): part = part.strip() match = regex.search(r"\[synonym\]: (.*)", part) if match: for synonym in match.group(1).split(","): synonym = synonym.strip() if synonym: count_synonyms[synonym] += 1 synonyms = set() for synonym, count in count_synonyms.items(): if count >= num_items: synonyms.add(synonym) if synonyms: dedup_words.append((word, synonyms)) duplicated = False for dedup_word, dedup_synonyms in dedup_words: if word[0] == dedup_word[0]: dist = tkrzw.Utility.EditDistanceLev(word, dedup_word) if dist <= 1 and (word in dedup_synonyms or dedup_word in synonyms): aliases[dedup_word].add(word) duplicated = True if duplicated: skipped_words.append(word) continue main_words.append(word) break extra_words = [] for extra_word in cluster[1]: if extra_word not in vetted_words: extra_words.append(extra_word) while len(main_words) < num_words and extra_words: main_words.append(extra_words[0]) extra_words = extra_words[1:] for skipped_word in skipped_words: extra_words.insert(0, skipped_word) for word in main_words: surfaces = [word] surfaces.extend(aliases.get(word) or []) main_surface = "" other_surfaces = [] if len(surfaces) == 1: main_surface = surfaces[0] else: prob_surfaces = [] for surface in surfaces: prob = float(phrase_dbm.GetStr(surface) or "0") prob_surfaces.append((surface, prob)) prob_surfaces = sorted(prob_surfaces, key=lambda x: x[1], reverse=True) main_surface = prob_surfaces[0][0] other_surfaces = [x[0] for x in prob_surfaces[1:]] section_main_words.append((main_surface, other_surfaces)) section_extra_word_lists.append(extra_words) for main_word in section_main_words: out_words.append(main_word[0]) out_path = os.path.join(self.output_path, "study-{:03d}.xhtml".format(num_sections)) logger.info("Creating: {}".format(out_path)) with open(out_path, "w") as out_file: self.OutputStudy(out_file, num_sections, has_next, section_main_words, section_extra_word_lists, body_dbm, uniq_words, out_children) out_path = os.path.join(self.output_path, "check-{:03d}.xhtml".format(num_sections)) logger.info("Creating: {}".format(out_path)) with open(out_path, "w") as out_file: self.OutputCheck(out_file, num_sections, has_next, out_words, 
out_children, body_dbm)
def getCoverageProblems(self):
    """Verify that each target, rule and exclusion has the right number of
    tests that apply to it. In particular, left-wildcard targets should have
    at least three tests and right-wildcard targets at least ten.
    Returns a list of strings reporting any coverage problems if they exist,
    or an empty list if coverage is sufficient.
    """
    self._determineTestApplication()
    problems = []
    # First, check each target has the right number of tests.
    myTestTargets = []
    # Only take tests which are not excluded into account.
    for test in self.tests:
        if not self.excludes(test.url):
            urlParts = urlparse(test.url)
            hostname = urlParts.hostname
            myTestTargets.append(hostname)
    for target in self.targets:
        actual_count = 0
        needed_count = 1
        if target.startswith("*."):
            needed_count = 3
        if target.endswith(".*"):
            needed_count = 10
        # A non-wildcard target always has an implicit test URL, unless it is excluded.
        if "*" not in target and not self.excludes("http://{}/".format(target)):
            continue
        # According to the logic in rules.js available at
        # EFForg/https-everywhere/blob/07fe9bd51456cc963c2d99e327f3183e032374ee/chromium/rules.js#L404
        pattern = target.replace('.', '\.')  # .replace('*', '.+')
        # `*.example.com` matches `bar.example.com` and `foo.bar.example.com` etc.
        if pattern[0] == '*':
            pattern = pattern.replace('*', '.+')
        # However, `example.*` matches `example.com` but not `example.co.uk`.
        if pattern[-1] == '*':
            pattern = pattern.replace('*', '[^\.]+')
        # `www.*.example.com` matches `www.image.example.com` but not `www.ssl.image.example.com`.
        pattern = pattern.replace('*', '[^\.]+')
        pattern = '^' + pattern + '$'
        for test in myTestTargets:
            if regex.search(pattern, test) is not None:
                actual_count += 1
                if actual_count >= needed_count:
                    break
        if actual_count < needed_count:
            problems.append(
                "{}: Not enough tests ({} vs {}) for {}".format(
                    self.filename, actual_count, needed_count, target))
    # Next, make sure each rule or exclusion has sufficient tests.
    for rule in self.rules:
        needed_count = 1 + len(regex.findall("[+*?|]", rule.fromPattern))
        # Don't treat the question mark in non-capturing and lookahead groups as
        # increasing the number of required tests.
        needed_count = needed_count - len(regex.findall("\(\?:", rule.fromPattern))
        needed_count = needed_count - len(regex.findall("\(\?!", rule.fromPattern))
        needed_count = needed_count - len(regex.findall("\(\?=", rule.fromPattern))
        # Don't treat escaped question marks as increasing the number of required tests.
        needed_count = needed_count - len(regex.findall("\\?", rule.fromPattern))
        actual_count = len(rule.tests)
        if actual_count < needed_count:
            problems.append(
                "{}: Not enough tests ({} vs {}) for {}".format(
                    self.filename, actual_count, needed_count, rule))
    for exclusion in self.exclusions:
        needed_count = 1 + len(regex.findall("[+*?|]", exclusion.exclusionPattern))
        needed_count = needed_count - len(regex.findall("\(\?:", exclusion.exclusionPattern))
        # The question-mark discount originally counted against rule.fromPattern
        # (a leftover loop variable); it should use the exclusion's own pattern.
        needed_count = needed_count - len(regex.findall("\\?", exclusion.exclusionPattern))
        actual_count = len(exclusion.tests)
        if actual_count < needed_count:
            problems.append(
                "{}: Not enough tests ({} vs {}) for {}".format(
                    self.filename, actual_count, needed_count, exclusion))
    return problems
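# To make the test-count heuristic above concrete, here is the same arithmetic
# applied to a hypothetical fromPattern (the pattern is an assumed example, not
# one taken from a ruleset):
import re

from_pattern = r"^http://(?:www\.)?example\.(com|net)/.+"

needed = 1 + len(re.findall(r"[+*?|]", from_pattern))  # 1 + four metacharacters
needed -= len(re.findall(r"\(\?:", from_pattern))      # discount the '?' in '(?:'
print(needed)  # 4: the remaining '?', '|' and '+' each require an extra test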
def web_tech(url, no_cache=False, verbose=False): if verbose: logging.basicConfig(level=logging.INFO, format='%(message)s') if not no_cache: homedir = Path(os.path.expanduser('~')) requests_cache.install_cache(str(homedir / '.habu_requests_cache'), expire_after=3600) try: r = requests.get(url) except Exception as e: logging.error(e) return False with (DATADIR / 'apps-habu.json').open() as f: data = json.load(f) apps = data['apps'] categories = data['categories'] content = r.text soup = BeautifulSoup(content, "lxml") tech = {} for app in apps: version_group = False for header in apps[app].get('headers', []): if header in r.headers: header_regex = apps[app]['headers'][header].split('\;')[0] if '\;version:\\' in apps[app]['headers'][header]: version_group = apps[app]['headers'][header].split( '\;version:\\')[1] match = re.search(header_regex, r.headers[header], flags=re.IGNORECASE) if match or not header_regex: logging.info( "{app} detected by {header} HTTP header = {header_content}" .format(app=app, header=header, header_content=r.headers[header])) if app not in tech: tech[app] = apps[app] if version_group and version_group.isdigit(): try: version = match.group(int(version_group)) if version: tech[app]['version'] = version logging.info( "The version detected is {version}".format( version=version)) except IndexError: pass for key in ['script', 'html']: version_group = False for item in apps[app].get(key, []): item_regex = item.split('\;')[0] if '\;version:\\' in item: version_group = item.split('\;version:\\')[1] match = re.search(item_regex, r.text, flags=re.IGNORECASE & re.MULTILINE) if match: logging.info( "{app} detected by HTML body with regex {regex}". format(app=app, regex=item_regex)) if app not in tech: tech[app] = apps[app] if version_group and version_group.isdigit(): try: version = match.group(int(version_group)) if version: tech[app]['version'] = version logging.info( "The version detected is {version}".format( version=version)) except IndexError: pass for url_regex in apps[app].get('url', []): match = re.search(url_regex, url, flags=re.IGNORECASE & re.MULTILINE) if match: logging.info("{app} detected by URL with regex {regex}".format( app=app, regex=url_regex)) if app not in tech: tech[app] = apps[app] for cookie_name in apps[app].get('cookies', []): for cookie in r.cookies: if cookie_name == cookie.name: logging.info("{app} detected by cookie {cookie}".format( app=app, cookie=cookie.name)) if app not in tech: tech[app] = apps[app] for meta in apps[app].get('meta', []): version_group = False for tag in soup.find_all("meta", attrs={'name': meta}): meta_regex = apps[app]['meta'][meta] if '\;version:\\' in meta_regex: version_group = meta_regex.split('\;version:\\')[1] meta_regex = meta_regex.split('\;')[0] try: match = re.search(meta_regex, tag['content'], flags=re.IGNORECASE) except KeyError: continue if match: logging.info( "{app} detected by meta {meta} tag with regex {regex}". 
format(app=app, meta=meta, regex=meta_regex)) if app not in tech: tech[app] = apps[app] if version_group and version_group.isdigit(): try: version = match.group(int(version_group)) if version: tech[app]['version'] = version logging.info( "The version detected is {version}".format( version=version)) except IndexError: pass for t in list(tech.keys()): for imply in tech[t].get('implies', []): imply = imply.split('\\;')[0] if imply not in tech: logging.info("{imply} detected because implied by {t}".format( imply=imply, t=t)) tech[imply] = apps[imply] for t in list(tech.keys()): for exclude in tech[t].get('excludes', []): logging.info( "removing {exlude} because its excluded by {t}".format( exlude=exclude, t=t)) del (tech[t]) response = {} for t in sorted(tech): response[t] = {'categories': []} if 'version' in tech[t]: response[t]['version'] = tech[t]['version'] for category in tech[t]['cats']: response[t]['categories'].append(categories[str(category)]['name']) return response
def delete(title):
    # Note: the "." in the pattern matches any character, so this also matches
    # titles like "B5x5", not only "B5.5".
    if re.search(r'B5.5', title):
        return title
    return None
def parse_number_with_month(self, source: str, reference: datetime) -> DateTimeParseResult: trimmed_source = source.strip() ambiguous = True result = DateTimeResolutionResult() ers = self.config.ordinal_extractor.extract(trimmed_source) if not ers: ers = self.config.integer_extractor.extract(trimmed_source) if not ers: return result num = int(self.config.number_parser.parse(ers[0]).value) day = 1 month = 0 match = regex.search(self.config.month_regex, trimmed_source) if match: month = self.config.month_of_year.get(match.group()) day = num else: # handling relative month match = regex.search(self.config.relative_month_regex, trimmed_source) if match: month_str = match.group('order') swift = self.config.get_swift_month(month_str) date = reference.replace(month=reference.month + swift) month = date.month day = num ambiguous = False # handling casesd like 'second Sunday' if not match: match = regex.search(self.config.week_day_regex, trimmed_source) if match: month = reference.month # resolve the date of wanted week day wanted_week_day = self.config.day_of_week.get( match.group('weekday')) first_date = DateUtils.safe_create_from_min_value( reference.year, reference.month, 1) first_weekday = first_date.isoweekday() delta_days = wanted_week_day - first_weekday if wanted_week_day > first_weekday else wanted_week_day - first_weekday + 7 first_wanted_week_day = first_date + timedelta(days=delta_days) day = first_wanted_week_day.day + ((num - 1) * 7) ambiguous = False if not match: return result year = reference.year # for LUIS format value string date = DateUtils.safe_create_from_min_value(year, month, day) future_date = date past_date = date if ambiguous: result.timex = DateTimeFormatUtil.luis_date(-1, month, day) if future_date < reference: future_date = future_date.replace(year=future_date.year + 1) if past_date >= reference: past_date = past_date.replace(year=past_date.year + 1) else: result.timex = DateTimeFormatUtil.luis_date(year, month, day) result.future_value = future_date result.past_value = past_date result.success = True return result
def render(source, record): katexstorage = {} # convert html character references (ie. >) to unicode source = html.unescape(source) # convert <cp>...</cp> source = re.sub(CP_REGEX, r'<div class="grey-block">\1</div>', source) # convert <cpb>...</cpb> source = re.sub(CPB_REGEX, r'<div class="blue-block">\1</div>', source) # convert <Q>...</Q> source = re.sub(Q_REGEX, r'<blockquote>\1</blockquote>', source) # convert <k>...</k> source = re.sub(K_REGEX, r'<div class="center-paragraph">\1</div>', source) # convert <ind>...</ind> source = re.sub(IND_REGEX, r'<div class="indent-paragraph">\1</div>', source) # convert latex to katex source = re.sub(LATEX_REGEX, lambda match: katexprerender(match, katexstorage), source) # convert ^superscript source = re.sub(SUPERSCRIPT_REGEX, r'<span class="superscript">\1</span>', source) # convert ¬subscript source = re.sub(SUBSCRIPT_REGEX, r'<span class="subscript">\1</span>', source) # convert <m>...</m> and <m name>...</m> source = re.sub(MLINK_REGEX, lambda match: mrender(match, record), source) # convert <g glossary>...</g> source = re.sub(GLINK_REGEX, lambda match: glrender(match, record), source) # convert <ac academy>...</ac> source = re.sub(ACLINK_REGEX, lambda match: societyrender(match, record), source) # convert <E num> source = re.sub(ELINK_REGEX, lambda match: extrarender(match, record), source) # convert <r>...</r> source = source.replace('<r>', '<span class="red-text">') source = source.replace('</r>', '</span>') # convert <bl>...</bl> source = source.replace('<bl>', '<span class="blue-text">') source = source.replace('</bl>', '</span>') # convert <gr>...</gr> source = source.replace('<gr>', '<span class="green-text">') source = source.replace('</gr>', '</span>') # convert <bro>...</bro> source = source.replace('<bro>', '<span class="brown-text">') source = source.replace('</bro>', '</span>') # convert <f+>...</f+> source = re.sub(FPLUS_REGEX, r'<span class="bigger">\1</span>', source) # convert <fp>...</fp> source = re.sub(FP_REGEX, r'<span class="bigger">\1</span>', source) # convert <f++>...</f++> source = re.sub( FPLUSPLUS_REGEX, r'<span class="bigger"><span class="bigger">\1</span></span>', source) # convert <f->...</f-> source = re.sub(FMINUS_REGEX, r'<span class="smaller">\1</span>', source) # convert <fm>...</fm> source = re.sub(FM_REGEX, r'<span class="smaller">\1</span>', source) # convert <c>...</c> source = source.replace('<c>', '<code>') source = source.replace('</c>', '</code>') # convert <ovl>...</ovl> source = source.replace('<ovl>', '<span class="overline">') source = source.replace('</ovl>', '</span>') # convert <d ...> source = re.sub(DIAGRAM_REGEX, lambda match: drender(match, record), source) # convert [refnum] source = re.sub(REF_REGEX, lambda match: referencerender(match, record), source) #source = re.sub(regex, r'<span>[<a href="#reference-\1" class="reference reference-\1">\1</a>]</span>', source) # convert <T num> source = re.sub(TRANS_REGEX, lambda match: trender(match, record), source) # other from the htmlformat function in the stack # new (improved?) 
break-adder TAGS_MATCHER = r'</?((?:n)|(?:table)|(?:tr)|(?:td(\s+colspan="?\d"?)?)|(?:figure)|(?:p)|(?:br)|(?:li)|(?:ol)|(?:ul)|(?:div(\s+id))|(?:div(\s+class="indent-paragraph"?)?)|(?:div(\s+class="center-paragraph"?)?)|(?:script)|(?:input)|(?:button)|(?:br ?/?)|(?:p)|(?:blockquote)|(?:code)|(?:h\d)|(?:hr ?/?)|(?:area)|(?:map))>' regex = re.compile( r'(?<!%s)\s*?\n(?!\s*%s)' % (TAGS_MATCHER, TAGS_MATCHER), re.MULTILINE | re.DOTALL) source = re.sub(regex, '\n<br>\n', source) # never more than two <br>s together match = re.search(BR_REGEX, source) while match: source = re.sub(BR_REGEX, '<br>\n<br>', source) match = re.search(BR_REGEX, source) # remove all the <n>s source = source.replace('<n>', '') # smart quotes source = source.replace('’', "'") source = source.replace('‘', "'") source = source.replace('“', '"') source = source.replace('”', '"') source = source.replace('<clear>', '<br clear="right">') source = source.replace('<clearl>', '<br clear="left">') source = source.replace('<proofend>', '<d xproofend right><br clear=right>') source = tags_to_unicode(source) source = fix_italics(source, record) # put the katex formulas back in latex_array = list(katexstorage.values()) html_array = katexrender(latex_array) for idx, key in enumerate(katexstorage.keys()): html_formula = html_array[idx] source = source.replace(key, html_formula) return '<span class="markup">%s</span>' % source
def number_with_month(self, source: str, reference: datetime) -> List[Token]: ret: List[Token] = list() extract_results = self.config.ordinal_extractor.extract(source) extract_results.extend(self.config.integer_extractor.extract(source)) for result in extract_results: num = int(self.config.number_parser.parse(result).value) if num < 1 or num > 31: continue if result.start >= 0: front_string = source[0:result.start or 0] match = regex.search(self.config.month_end, front_string) if match is not None: ret.append( Token(match.start(), match.end() + result.length)) continue # handling cases like 'for the 25th' matches = regex.finditer(self.config.for_the_regex, source) is_found = False for match_case in matches: if match_case is not None: ordinal_num = RegExpUtility.get_group( match_case, 'DayOfMonth') if ordinal_num == result.text: length = len( RegExpUtility.get_group(match_case, 'end')) ret.append( Token(match_case.start(), match_case.end() - length)) is_found = True if is_found: continue # handling cases like 'Thursday the 21st', which both 'Thursday' and '21st' refer to a same date matches = regex.finditer( self.config.week_day_and_day_of_month_regex, source) for match_case in matches: if match_case is not None: ordinal_num = RegExpUtility.get_group( match_case, 'DayOfMonth') if ordinal_num == result.text: month = reference.month year = reference.year # get week of day for the ordinal number which is regarded as a date of reference month date = DateUtils.safe_create_from_min_value( year, month, num) num_week_day_str: str = calendar.day_name[ date.weekday()].lower() # get week day from text directly, compare it with the weekday generated above # to see whether they refer to a same week day extracted_week_day_str = RegExpUtility.get_group( match_case, 'weekday').lower() if (date != DateUtils.min_value and self.config.day_of_week[num_week_day_str] == self.config. day_of_week[extracted_week_day_str]): ret.append( Token(match_case.start(), match_case.end())) is_found = True if is_found: continue # handling cases like '20th of next month' suffix_str: str = source[result.start + result.length:].lower() match = regex.match(self.config.relative_month_regex, suffix_str.strip()) space_len = len(suffix_str) - len(suffix_str.strip()) if match is not None and match.start() == 0: ret.append( Token( result.start, result.start + result.length + space_len + len(match.group()))) # handling cases like 'second Sunday' match = regex.match(self.config.week_day_regex, suffix_str.strip()) if (match is not None and match.start() == 0 and num >= 1 and num <= 5 and result.type == NumberConstants.SYS_NUM_ORDINAL): week_day_str = RegExpUtility.get_group(match, 'weekday') if week_day_str in self.config.day_of_week: ret.append( Token( result.start, result.start + result.length + space_len + len(match.group()))) if result.start + result.length < len(source): after_string = source[result.start + result.length:] match = regex.match(self.config.of_month, after_string) if match is not None: ret.append( Token( result.start, result.start + result.length + len(match.group()))) return ret
for aktuelle_zahl in range(0, 10):
    verzeichnis = './TESTSET/' + str(aktuelle_zahl) + '/'
    print("Directory: ", verzeichnis)
    # Regex, not a glob: 'f*' means zero or more 'f' and '.' is any character.
    regex_png = regex.compile('f*.png')
    with os.scandir(verzeichnis) as entries:
        # print("number of entries ", len(entries))
        for entry in entries:
            if regex.search(regex_png, entry.name):
                pfad = verzeichnis + entry.name
                image = mpimg.imread(pfad)
                image *= 255
                # for testing
                x_test[test_index_dataset] = image
                y_test[test_index_dataset] = aktuelle_zahl
                test_index_dataset = test_index_dataset + 1
                # for training - from index 10,000 onwards
                x_train[train_index_dataset] = image
                y_train[train_index_dataset] = aktuelle_zahl
                train_index_dataset = train_index_dataset + 1
def add_cpgs(objs): sp_cc = 0 cc = 0 ## 这里是给出了由训练集得出的一些明显不可能是出品公司的关键字,其中和文化、和传媒等关键字因为其中的和会被当成连接字符给且分开所以也屏蔽掉 forbid_kws = [ '》', '制作', '投资', '在', '担任', '联手', '怎么', ] forbid_kws += ['摄制', '电视剧', '和传媒', '旗下', '庆祝', '和平', '自家', '主演', '和文化'] relist, _ = get_re() relist = relist['出品公司'] strip_kw = ['一部', '一部由', '由', '联合', '共同', '独家', '合作', '著', '创作'] for obj in objs: havecp = False for spo in obj['spo_list']: if spo['predicate'] == '出品公司': havecp = True if havecp or '出品' in obj['text']: # print_spo(obj['spo_list'],'出品公司') # print( obj['text'] ) texts = split_text(obj['text'], r"[?。,]") for text in texts: text = text[text.find('》') + 1:] for restr in relist: reresult = regex.search(restr[0], text) if reresult != None: reresult = str(reresult.group()) ##等后面就不要了 if '等' in reresult: reresult = reresult[:reresult.find('等')] cflag = True for fkw in forbid_kws: if fkw in reresult: cflag = False break if not cflag: break for rss in [ r'\d{4}年\d{1,2}月\d{1,2}日', r'\d{4}年\d{1,2}月', r'\d{4}年' ]: yearresult = re.search(rss, reresult) if yearresult is not None: yearresult = yearresult.group() if '于' + yearresult in reresult: reresult = reresult[:reresult. find('于' + yearresult)] reresult = reresult.replace(yearresult, '') break ##去掉 for skw in strip_kw: if skw in reresult: reresult = reresult.strip(skw) reresult = reresult.rstrip('于') if reresult != '': reresult = set(split_text(reresult, r"[、与和及]")) temp_set = [] for rst in reresult: ##如果是一个数字 if '' != rst and not rst[0].isdigit(): if '联合' in rst: temp_set.extend(rst.split('联合')) else: temp_set.append(rst) reresult = set(temp_set) # print('!!!!!',reresult) ents = set() for spo in obj['spo_list']: if spo['predicate'] == '出品公司': ents.add(spo['object']) dif = reresult - ents if len(dif) > 0: cc += len(dif) # print(dif ,reresult,obj['text']) ## 处理那些有same的 same = reresult & ents if len(same) > 0: sp_set = set() for sm in same: for spo in obj['spo_list']: if sm == spo['object']: sp_set.add((spo['subject'], spo['predicate'])) for df in dif: for sp in sp_set: add_spo(obj, sp[0], df, sp[1]) sp_cc += 1 # print('!!! sp',sp_set) else: ##如果只有一个书名号,就取书名号中的内容 shumings = list( set( regex.findall( "(?<=《).*?(?=》)", obj['text']))) if len(shumings) == 1: for df in dif: add_spo(obj, shumings[0], df, '出品公司') # print(shumings[0],df) print(shumings, dif, obj['text']) sp_cc += len(dif) print('sss', cc, sp_cc)
        # (tail of convert_pdf_to_txt_v2, continued from above)
        text = text.decode("ascii", "ignore")
        return text
    except Exception as e:
        print(e)
        pass


outputDictionary = {}
claim = convert_pdf_to_txt_v2("/home/ubuntu/environment/Claim.pdf")
claim = repr(claim)

# Regular Expressions
claimUpdated = re.sub(r"\\n\\n", r"\\n", claim)
insuredName = re.search(r"(?<=Name and address of Insured\\n)[A-Za-z ]+", claimUpdated)[0]
outputDictionary["Insured Name:"] = insuredName
insuredAddressL1 = regex.search(r"Name and address of Insured\\n[A-Za-z ]+\\n\K\S.*?(?=\\n)", claimUpdated)[0]
insuredAddressL2 = regex.search(r"Name and address of Insured\\n[A-Za-z ]+\\n[1-9A-Za-z ]+\\n\K\S.*?(?=\\n)", claimUpdated)[0]
insuredAddressL3 = regex.search(r"Name and address of Insured\\n[A-Za-z ]+\\n[1-9A-Za-z ]+\\n[A-Za-z ]+\\n\K\S.*?(?=\\n)", claimUpdated)[0]
insuredAddress = insuredAddressL1 + " " + insuredAddressL2 + " " + insuredAddressL3
outputDictionary["Insured Address:"] = insuredAddress
dob = regex.search(r"DOB:\s*\K\S.*?(?=\\n)", claimUpdated)[0]
outputDictionary["Date of birth:"] = dob
employersName = regex.search(r"Name:\s*\K\S.*?(?=\\n)", claimUpdated)[0]
outputDictionary["Employer's Name:"] = employersName
employersLocation = regex.search(r"Location:\s*\K\S.*?(?=\\n)", claimUpdated)[0]
outputDictionary["Employer's Location:"] = employersLocation
claim = re.search(r"(?<=Medical History).*$", claim)[0]