def siman_smk_exctractor(smk_text):

    split = re.split(u'\s', smk_text)
    simanim = []
    for word in split:
        if not word or word == u'סימן' or word == u'סעיף':
            continue
        word = re.sub(u"[;.,']", u"", word)
        if re.search(u'-', word):
            borders = re.search(u"(.*?)-(.*)", word)
            start = getGematria(borders.group(1))
            end = getGematria(borders.group(2))
            for siman in range(start, end+1):
                simanim.append(siman)
            # the hyphenated range has been handled; skip the single-value parsing below
            continue
        if not is_hebrew_number(word):
            if not check_vav(word):
                # print smk_text, simanim
                return simanim
            else:
                simanim.append(check_vav(word))
        else:
            smk_siman = getGematria(word)
            simanim.append(smk_siman)
    # print smk_text, simanim
    return simanim
Example 2
def grep_author_note(s):
  author = ''
  g = 3
  while True:
    s = re.sub(eres['clean_author'], '', s)
    tmp = re.search(eres['author'], s)
    if tmp:
      # all the following mess is needed because the regex engine is too slow
      #   (it backtracks through every possible combination if the
      #   forbidden part is written inside the `author' regex)
      # TODO handle situation '..., Surname A Paper Name ...' (where `A'
      #   belongs to Paper Name...)
      if re.search(eres['author_forbidden_rest'], s[tmp.end(g):]):
        break
      ss = tmp.group(g)
      tmpp = re.search(eres['author_forbidden'], ss)
      if tmpp:
        maybe_empty = ss[:tmpp.start()]
        if author and maybe_empty:
          author += AUTHOR_JOIN
        author += maybe_empty
        s = s[tmp.start(g) + tmpp.start(2):]
        break
      else:
        if author:
          author += '; '
        author += ss.strip(' \t')
        # save only the rest (drop prefixes like `and' `with' etc.)
        s = s[tmp.end(g):]
    else:
      break
  return author, s
Example 3
def clean_line(line):
    line = strip_nikkud(line)
    replace_dict = {u'[.:\?]': u'', u'[”״]': u'"', u'[’׳]': u"'"}  # note: put \. in the file / how can I check if it is right?
    line = multiple_replace(line, replace_dict, using_regex=True)
    # line = re.sub(u'[:\?]', '', line)
    # line = re.sub(u'”', u'"', line)
    reg_parentheses = re.compile(u'\((.*?)\)')
    reg_brackets = re.compile(u'\[(.*?)\]')
    in_per = reg_parentheses.search(line)
    in_bra = reg_brackets.search(line)
    reg_ayyen_tur = re.compile(u'''ו?(עיין|עי'|ע"ש) בטור''')
    reg_lo_manu = re.compile(u'''(?P<a>(\u05d0\u05da )?\u05dc\u05d0 \u05de\u05e0(.*?))(\u05e1\u05de"?\u05d2|\u05e8\u05de\u05d1"?\u05dd|\u05d8\u05d5\u05e8|\n)''')
    line = re.sub(u'\[.*?אלפס.*?\]', u'', line)
    line = re.sub(u'טור ו?שו"ע', u'טוש"ע', line)
    f_ayyen = re.search(reg_ayyen_tur, line)
    f_lo_manu = re.search(reg_lo_manu, line)

    if f_ayyen:
        line = line[:f_ayyen.start()]
    if f_lo_manu:
        line = re.sub(f_lo_manu.group('a'), u"", line)
    if in_per:
        if in_bra:
            clean = re.sub(reg_brackets, ur'\1', line)  # brackets are always correct
            clean = re.sub(reg_parentheses, '', clean)
        else:
            clean = re.sub(reg_parentheses, ur'\1', line)
    elif in_bra:
        clean = re.sub(reg_brackets, ur'\1', line)  # brackets are always correct
    else:
        clean = line
    return clean
Example 4
def grep_year_note(s, reverse = False, with_paren = True):
  s = clean_start_end(s, start = False)
  s = re.sub(eres['clean_author'], '', s)
  if reverse:
    postfix = 'r'
  else:
    postfix = ''

  if not with_paren:
    postfix += '_no_paren'

  tmp = re.search(eres['year4' + postfix], s)
  if tmp:
    year = tmp.group(1)
    note = s[:tmp.start()] + s[tmp.end():]
  else:
    tmp = re.search(eres['year2' + postfix], s)
    if tmp:
      year = '20' + tmp.group(1)
      note = s[:tmp.start()] + s[tmp.end():]
    else:
      year = ''
      note = s

  return year, note
Example 5
 def __check_error(self, str):
     if str[0] in string.ascii_lowercase:
         # At least one occurrence must be there for sure.
         m = regex.search(irrelevant_id_symbols, str)
         err_code = ERR_IRRELEVANT_SYMBOL
         err_pos = m.start()
     elif str[0] == 'R':
         if str[1] == 'T':
             if str[2] == '_':
                 # At least one occurrence must be there for sure.
                 m = regex.search(irrelevant_id_symbols, str[3:])
                 err_pos = m.start()
             else:
                 err_pos = 2
         else:
             err_pos = 1
         err_code = ERR_IRRELEVANT_SYMBOL        
     elif str[0] == '"':
         if str.find('"', 1) == -1:
             err_code = ERR_QUOTES_NOT_CLOSED
             err_pos = 0
         else:
             # At least one occurrence must be there for sure.
             m = regex.search(irrelevant_str_symbols, str[1:])
             err_code = ERR_IRRELEVANT_SYMBOL
             err_pos = m.start() + 1
     else:
         err_code = ERR_IRRELEVANT_SYMBOL
         err_pos = 0
     return (err_code, err_pos)
Example 6
def main():
	metaHtmlFilename = "Quaero/web/quaero_meta.html"
	standardHtmlFilename = "Quaero/web/quaero.html"
	latex2htmlLogFilename = "Quaero/doc/manual/html.log"
	metaHtml = open(metaHtmlFilename,'r')
	standardHtml = open(standardHtmlFilename,'w')
	classified=0
	for line1 in metaHtml.readlines():
		keys = re.search("helpwindow_MANUAL\((.*)\)\.html",line1)
		if(not keys==None):
			key = keys.group(1)
			key = regsub.gsub("\+","\\+",key)
			latex2htmlLog = open(latex2htmlLogFilename,'r')
			foundNodeNumber = 0
			for line2 in latex2htmlLog.readlines():
				nodeNumber = re.search('"'+key+'" for node([0-9]*)\.html',line2)
				if(not nodeNumber==None):
					line1 = regsub.gsub("helpwindow_MANUAL("+key+").html","manual/manual/node"+nodeNumber.group(1)+".html",line1)
					foundNodeNumber = 1
			if(foundNodeNumber==0):
				print 'Key "'+key+'" not found.'
			latex2htmlLog.close()
		if regex.search("BeginClassified",line1) >= 0:
			classified=1
		if regex.search("EndClassified",line1) >= 0:
			classified=0
		if(classified==0):
			standardHtml.write(line1)
		if regex.search("</html>",line1) >= 0:
			break
	metaHtml.close()
	standardHtml.close()
    def parse_implied_depth(self, element):
        ja_depth_pattern = ur"\[(\d)\]$"
        ja_sections_pattern = ur"\[(.*)\]$"
        title_str = element.get('text').strip()

        depth_match = re.search(ja_depth_pattern, title_str)
        if depth_match:
            depth = int(depth_match.group(1))
            placeholder_sections = ['Volume', 'Chapter', 'Section', 'Paragraph']
            element.set('text', re.sub(ja_depth_pattern, "", title_str))
            return {'section_names': placeholder_sections[(-1 * depth):], 'address_types' : ['Integer'] * depth}

        sections_match = re.search(ja_sections_pattern, title_str)
        if sections_match:
            sections = [s.strip() for s in sections_match.group(1).split(",")]
            element.set('text', re.sub(ja_sections_pattern, "", title_str))
            section_names = []
            address_types = []
            for s in sections:
                tpl = s.split(":")
                section_names.append(tpl[0])
                address_types.append(tpl[1] if len(tpl) > 1 else 'Integer')

            return {'section_names': section_names, 'address_types' : address_types}
        else:
            return None
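A small usage sketch for parse_implied_depth. It is written as a method, but its body never touches self, so for illustration it is called here as a plain function with None; the xml.etree element below is a stand-in and an assumption about the real element type.

import xml.etree.ElementTree as ET

element = ET.Element("category", attrib={"text": "Genesis [2]"})
result = parse_implied_depth(None, element)
print(result)               # {'section_names': ['Section', 'Paragraph'], 'address_types': ['Integer', 'Integer']}
print(element.get("text"))  # "Genesis " -- the depth marker has been stripped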
Example 8
 def get_sham_ref_with_node(st, node, lang='he'):
     """
     used when you know the node that a sham ref belongs to in order to parse the ref according to the address types
     of that node
     :param st: string to search for sham ref
     :param node: node that we believe this sham ref belongs to
     :param lang:
     :return:
     """
     title_sham = u'שם'
     title_reg = CitationFinder.get_ultimate_title_regex(title_sham, node, lang, compiled=True)
     if node.full_title() in [u'I Samuel', u'II Samuel', u'I Kings', u'II Kings', u'I Chronicles', u'II Chronicles']:
         volume = re.search(u'שם (א|ב)\s', st)
         m = re.search(title_reg, st)
         if volume:
             st1 = re.sub(u'(א|ב)\s', u'', st, count=1, pos=m.start())
             m1 = re.search(title_reg, st1)
             if m1 and m1.groupdict()['a0'] and m1.groupdict()['a1']:
                 node = CitationFinder.node_volumes(node, volume.group(1))
                 return CitationFinder.parse_sham_match(m1, lang, node)
         if m:
             return CitationFinder.parse_sham_match(m, lang, node)  # there should be one and only one match
     else:
         title_reg = CitationFinder.get_ultimate_title_regex(title_sham, node, lang, compiled=True)
         m = re.search(title_reg, st)
         if m:
             return CitationFinder.parse_sham_match(m, lang, node)
     raise InputError
def strip_derived_rvs(rvs):
    '''Convenience fn: remove PyMC3-generated RVs from a list'''
    ret_rvs = []
    for rv in rvs:
        if not (re.search('_log',rv.name) or re.search('_interval',rv.name)):
            ret_rvs.append(rv)     
    return ret_rvs
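A minimal check of strip_derived_rvs, assuming only that PyMC3 RVs expose a .name attribute; FakeRV below is a stand-in used purely for illustration.

import re
from collections import namedtuple

FakeRV = namedtuple("FakeRV", "name")  # stand-in for PyMC3 random variables
rvs = [FakeRV("mu"), FakeRV("sigma_log__"), FakeRV("theta_interval__")]
print([rv.name for rv in strip_derived_rvs(rvs)])  # ['mu']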
Example 10
def plot_phonemes(path):
    phoneme_embeddings = dict()
    for line in codecs.open(path,"r"):
        line = line.split(",")
        key= line[0][1:-1]
        emb = line[1:]
        emb[-1] = emb[-1][:-1]
        emb = np.array([float(e) for e in emb])
        phoneme_embeddings[key] = emb
    
    phoneme_embeddings = DataFrame(phoneme_embeddings,columns=phoneme_embeddings.keys())
    print(phoneme_embeddings.columns)
    
    m = TSNE()
    phoneme_embeddings_tsne = m.fit_transform(phoneme_embeddings.transpose())
    print(len(phoneme_embeddings_tsne))
    for p,emb in zip(phoneme_embeddings.columns, phoneme_embeddings_tsne):
        c = "black"
        if regex.search("^[aeiou3E][*]?$", p):
            c = "red"
            plt.annotate(p,(emb[0],emb[1]),color=c)
        if regex.search("^.*w~$", p):
            c = "blue"
            plt.annotate(p,(emb[0],emb[1]),color=c)
        if regex.search("^.*y~$", p):
            c = "yellow"
            plt.annotate(p,(emb[0],emb[1]),color=c)
        if regex.search("^.*h~$", p):
            c = "brown"
            plt.annotate(p,(emb[0],emb[1]),color=c)
        if regex.search("^.*\"$", p):
            c = "green"
            plt.annotate(p,(emb[0],emb[1]),color=c)
Example 11
def parseaddr(address):
	# This is probably not perfect
	address = string.strip(address)
	# Case 1: part of the address is in <xx@xx> form.
	pos = regex.search('<.*>', address)
	if pos >= 0:
		name = address[:pos]
		address = address[pos:]
		length = regex.match('<.*>', address)
		name = name + address[length:]
		address = address[:length]
	else:
		# Case 2: part of the address is in (comment) form
		pos = regex.search('(.*)', address)
		if pos >= 0:
			name = address[pos:]
			address = address[:pos]
			length = regex.match('(.*)', name)
			address = address + name[length:]
			name = name[:length]
		else:
			# Case 3: neither. Only an address
			name = ''
	name = string.strip(name)
	address = string.strip(address)
	if address and address[0] == '<' and address[-1] == '>':
		address = address[1:-1]
	if name and name[0] == '(' and name[-1] == ')':
		name = name[1:-1]
	return name, address
Example 12
def derive_new_rasag_halakhah_links(sources, generated_msg='rsg_sfm_linker'):
    """
    This function returns links between the rasag and the halakhah links of the sources.
    the sources param are the "middleman" rasag-sources-halakhah and this function creates the links rasag-halakhah
    :param sources: a list of texts or categories on Sefaria
    :param generated_msg: a string to put on the link for the generated_by message
    :return: links rasag-halakhah
    """
    new_links = []
    source_links = get_links(sources)
    for link in source_links:
        rsg, otherref = link['refs'] if re.search("Sefer Hamitzvot", link['refs'][0]) else [link['refs'][1], link['refs'][0]]
        ls_otherref = LinkSet(Ref(otherref)).filter('Halakhah')
        # print otherref
        c = set([l.refs[0] for l in ls_otherref] + [l.refs[1] for l in ls_otherref])
        # cluster_refs = [Ref(r) for r in c]
        # create_link_cluster(cluster_refs, 30044, "Sifrei Mitzvot",
        #                     attrs={"generated_by": "rsg_sfm_linker"}, exception_pairs=[("Tur", "Shulchan Arukh")])
        c = c.difference({otherref, rsg})
        for ref_string in list(c):
            if re.search("Sefer Hamitzvot", ref_string) or not re.search("Sefer Hamitzvot", rsg):
                continue
            link=({
                "refs":[
                    rsg,
                    ref_string
                ],
                "type": "Sifrei Mitzvot",
                "auto": True,
                "generated_by": generated_msg
            })
            # print link['refs']
            new_links.append(link)
    print len(new_links)
    return new_links
Example 13
def parse_movie(search_name):
    """
    Parse a movie name into name/year.

    :param search_name: release name
    :return: (name, year)
    """
    result = regex.search('^(?P<name>.*)[\.\-_\( ](?P<year>19\d{2}|20\d{2})', search_name, regex.I)
    if result:
        result = result.groupdict()
        if 'year' not in result:
            result = regex.search(
                '^(?P<name>.*)[\.\-_ ](?:dvdrip|bdrip|brrip|bluray|hdtv|divx|xvid|proper|repack|real\.proper|sub\.?fix|sub\.?pack|ac3d|unrated|1080i|1080p|720p|810p)',
                search_name, regex.I)
            if result:
                result = result.groupdict()

        if 'name' in result:
            name = regex.sub('\(.*?\)|\.|_', ' ', result['name'])
            if 'year' in result:
                year = result['year']
            else:
                year = ''
            return {'name': name, 'year': year}

    return None
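A quick usage sketch for parse_movie; the release names below are made up and the third-party regex package is assumed to be installed.

import regex

print(parse_movie("Some.Movie.2012.1080p.BluRay.x264"))
# -> {'name': 'Some Movie', 'year': '2012'}
print(parse_movie("no year here"))
# -> None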
Example 14
def tabbed_output(columns, output_file, match_re_column_list=None, legacy_handling=None, match_any=False):
    if legacy_handling:
        match_re_column_list = [(match_re_column_list, legacy_handling)]
    try:
        if isinstance(match_re_column_list[0], str):
            match_re_column_list = [match_re_column_list]
    except TypeError:
        pass
    if isinstance(columns, dict):
        if not isinstance(columns, OrderedDict):
            raise TypeError("dictionary argument of 'tabbed_output()' must be an OrderedDict")
        if match_re_column_list:
            match_re_column_list = [(rx, columns[match_column]) for rx, match_column in match_re_column_list]
        columns = columns.values()
    lengths = [len(column) for column in columns]
    if len(set(lengths)) == 1:
        for i in range(lengths[0]):
            for column in columns:
                if isinstance(column[i], str):
                    column[i] = [column[i]]
                column[i] = "‣".join(column[i])
            line = "\t".join([clean_str(column[i]) for column in columns])
            if match_re_column_list:
                if match_any:
                    if any([re.search(rx, match_column[i]) for rx, match_column in match_re_column_list]):
                        print(line, file=output_file)
                else:
                    if all([re.search(rx, match_column[i]) for rx, match_column in match_re_column_list]):
                        print(line, file=output_file)
            else:
                print(line, file=output_file)
    else:
        raise IndexError("first argument of 'tabbed_output()' must be an OrderedDict or list of identical-length lists")
def plot_traces_pymc(trcs, varnames=None):
    ''' Convenience fn: plot traces with overlaid means and values 
        Handle nested traces for hierarchical models
    '''

    nrows = len(trcs.varnames)
    if varnames is not None:
        nrows = len(varnames)
    
    ax = pm.traceplot(trcs, varnames=varnames, figsize=(12, nrows*1.4),
                      lines={k: v['mean'] for k, v in 
                                pm.df_summary(trcs,varnames=varnames).iterrows()},
                      combined=True)

    # don't label the nested traces (a bit clumsy this: consider tidying)
    dfmns = pm.df_summary(trcs, varnames=varnames)['mean'].reset_index()
    dfmns.rename(columns={'index':'featval'}, inplace=True)
    dfmns = dfmns.loc[dfmns['featval'].apply(lambda x: re.search('__[1-9]{1,}', x) is None)]
    dfmns['draw'] = dfmns['featval'].apply(lambda x: re.search('__0{1}$', x) is None)
    dfmns['pos'] = np.arange(dfmns.shape[0])
    dfmns.set_index('pos', inplace=True)

    for i, r in dfmns.iterrows():
        if r['draw']:
            ax[i,0].annotate('{:.2f}'.format(r['mean']), xy=(r['mean'],0)
                    ,xycoords='data', xytext=(5,10)
                    ,textcoords='offset points', rotation=90
                    ,va='bottom', fontsize='large', color='#AA0022') 
Example 16
def scrape_wiki():
    url = u"https://he.wikipedia.org/wiki/%D7%9E%D7%A0%D7%99%D7%99%D7%9F_%D7%94%D7%9E%D7%A6%D7%95%D7%95%D7%AA_%D7%A2%D7%9C_%D7%A4%D7%99_%D7%A1%D7%A4%D7%A8_%D7%94%D7%97%D7%99%D7%A0%D7%95%D7%9A"

    page = requests.get(url)
    soup_body = BeautifulSoup(page.text, "lxml")
    tables = soup_body.select(".mw-parser-output > table")

    pairs = []
    links = []

    for table in tables:
        table_tr = table.select("tr")
        for col in table_tr:
            pairs.append((col.contents[1].text.strip(), re.sub(u'</?td>', u'', col.contents[-1].text).strip()))

    for pair in pairs:
        if re.search(u'ספר|מספר', pair[0]):
            continue
        neg_pos = u"Negative Mitzvot" if re.search(u"לאו", pair[1]) else u'Positive Mitzvot'
        rambam = getGematria(re.sub(u'עשה|לאו', u'', pair[1]).strip())
        chinukh = getGematria(pair[0])
        print chinukh, rambam
        chinukh_simanlen = len(Ref(u'Sefer HaChinukh.{}'.format(chinukh)).all_segment_refs())
        print neg_pos
        link = ({"refs": [
            u'Sefer HaChinukh.{}.{}-{}'.format(chinukh, 1, chinukh_simanlen),
            u'Mishneh Torah, {}.{}'.format(neg_pos, rambam)
        ],
            "type": "Sifrei Mitzvot",
            "auto": True,
            "generated_by": "chinukh_rambam_sfm_linker"  # _sfm_linker what is this parametor intended to be?
        })
        print link['refs']
        links.append(link)
    return links
Example 17
def lcc_range(string):
    """
    Takes a string, returns a tuple of two LCClassNumbers, the start and
    end of the range.
    """
    string = string.encode("ascii","replace")
    string = string.replace("(","")
    string = string.replace(")","")
    if string.endswith("A-Z"):
        # TMI in the schedules when they're alphabetical.
        # I don't care.
        string = string.replace("A-Z","")

    if "-" not in string:
        # A range of self length.
        return (LCCallNumber(string), LCCallNumber(string))

    parts = string.split("-")
    if re.search(r"^\d",parts[1]):
        header = re.sub("^([A-Z]+).*",r"\1",parts[0])
    elif re.search(r"^\.",parts[1]):
        header = re.sub(r"^([A-Z]+\d)+\..*",r"\1",parts[0])
    elif re.search(r"^[A-Z]",parts[1]):
        header = re.sub(r"^([A-Z]+\d)+\..[A-Z]*",r"\1.",parts[0])            
    else:
        header = " "

    parts[1] = header + parts[1]
    return (
        LCCallNumber(parts[0]),
        LCCallNumber(parts[1])
    )
Example 18
def _correct_splitlines_in_headers(markers, lines):
    """
    Corrects markers by removing splitlines deemed to be inside header blocks.
    """
    updated_markers = ""
    i = 0
    in_header_block = False

    for m in markers:
        # Only set in_header_block flag when we hit an 's' and line is a header
        if m == 's':
            if not in_header_block:
                if bool(re.search(RE_HEADER, lines[i])):
                    in_header_block = True
            else:
                if QUOT_PATTERN.match(lines[i]):
                    m = 'm'
                else:
                    m = 't'

        # If the line is not a header line, set in_header_block false.
        if not bool(re.search(RE_HEADER, lines[i])):
            in_header_block = False

        # Add the marker to the new updated markers string.
        updated_markers += m
        i += 1

    return updated_markers
Example 19
def tabularize(source, output_file, match_re_lv_list=None, lv_list=None, tagged=False, tag_types=set(), inc_df=False):
    if not lv_list:
        lv_list = sorted(set(flatten([mn.lv_list() for mn in source])))
    try:
        if isinstance(match_re_lv_list[0], str):
            match_re_lv_list = [match_re_lv_list]
    except TypeError: pass
    try:
        l = tqdm(source)
    except NameError:
        l = source  # tqdm is unavailable; fall back to iterating the source directly
    for mn in l:
        line = tab_line(mn, lv_list, tagged, tag_types)
        if match_re_lv_list:
            match = False
            for rx, match_lv in match_re_lv_list:
                for ex in [dn.ex for dn in mn.dn_list if dn.lv == match_lv]:
                    if re.search(rx, clean_str(str(ex))): match = True
                if inc_df:
                    for df in [df for df in mn.df_list if df.lv == match_lv]:
                        if re.search(rx, clean_str(str(df))): match = True
            if match and clean_str(line): print(line, file=output_file)
        elif clean_str(line):
            print(line, file=output_file)
        else: pass
Example 20
def tokenize(text):
    # This line changes tabs into spaces
    text = re.sub(r"\t", " ", text)
    # put blanks around characters that are unambiguous separators
    text = re.sub(always_sep, r" \g<0> ", text)
    # if a word is a separator in the beginning of a token separate it here
    text = re.sub("^" + begin_sep, r"\g<0> ", text)
    text = re.sub(" " + begin_sep, r"\g<0> ", text)
    text = re.sub("(" + not_letter + ")(" + begin_sep + ")", r"\1 \2", text)
    # idem for final separators
    text = re.sub(end_sep + r"\s", r" \g<0>", text)
    text = re.sub(end_sep + "(" + not_letter + ")", r"\1 \2",
                  text)  # the end separator is already between parentheses and is stored in $1

    # This line divides the input line and assigns it to elements of an array
    all_words = text.split()
    words = []
    # We examine all the elements
    for word in all_words:
        # if it contains a letter followed by a period,
        if re.search(letter + r"\.", word):
            # we see if it is an abbreviation
            # if it is explicitly found in the abbreviation list
            if word not in abbr:
                # or matches the regular expression below, we keep the period attached (possible acronyms)
                if not re.search(r"^(\p{L}\.(\p{L}\.)+|\p{Lu}[bcdfghj-np-tvxz]+\.)$", word):
                    # if not, a space is inserted before the period
                    word = re.sub(r"\.$", r" .", word)
        # Change all spaces to new lines
        word = re.sub(r"[ \t]+", r"\n", word)
        # Append the current word
        words.append(word)
    return words
Example 21
def simple_rule(rule, targets):
    """Is this rule a simple rule?  A simple rule rewrites a single hostname,
    perhaps with an optional leading www\., to itself or to itself plus www.,
    at the top level with no other effects."""
    rule_from = rule.attrib["from"]
    rule_to = rule.attrib["to"]
    # Simple rule with no capture
    if regex.match(r"^\^http://[-A-Za-z0-9.\\]+/$", rule_from):
        applicable_host = unescape(regex.search(r"^\^http://([-A-Za-z0-9.\\]+)/$", rule_from).groups()[0])
        if regex.match(r"^https://%s/" % applicable_host, rule_to) or regex.match(
            r"^https://www\.%s/" % applicable_host, rule_to
        ):
            return True
        else:
            return False
    # Optional www
    if regex.match(r"^\^http://\(www\\\.\)\?[-A-Za-z0-9.\\]+/$", rule_from):
        applicable_host = unescape(regex.search(r"^\^http://\(www\\\.\)\?([-A-Za-z0-9.\\]+)/$", rule_from).groups()[0])
        if regex.match(r"^https://www\.%s/" % applicable_host, rule_to) or regex.match(
            r"^https://%s/" % applicable_host, rule_to
        ):
            return True
        else:
            return False
    return False
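A hedged usage sketch for simple_rule. It assumes the surrounding module's unescape() helper and the regex package are available; the rule element is built with xml.etree purely for illustration of an HTTPS-Everywhere-style ruleset entry.

import xml.etree.ElementTree as ET

rule = ET.Element("rule", attrib={
    "from": r"^http://example\.com/",
    "to": "https://example.com/",
})
print(simple_rule(rule, targets=None))  # True: the host is rewritten to itself over https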
Example 22
 def parse_semag(self, str, mass):
     # split = re.split('\s', str.strip())
     reg_book = re.compile(u'ו?(עשין|שם|לאוין)')
     split = re.split(reg_book, str.strip())
     # str_list = filter(None, split)
     str_list = filter(None, [item.strip() for item in split])
     resolveds = []
     # it = iter(str_list)
     derabanan_flag = False
     book = None
     for i, word in enumerate(str_list):
         if derabanan_flag:
             derabanan_flag = False
             resolved = self._tracker.resolve(book, [1])
             resolveds.append(resolved)
             continue
         elif re.search(reg_book, word):
             # book = word
             # if book == u'שם':
             #     book = None
             # elif book == u'לאוין':
             #     book = u'Sefer Mitzvot Gadol, Volume One'
             try:
                 if word != u'שם':
                     derabanan = filter(None, [item.strip() for item in re.split(u'(מד"ס|מ?דרבנן)',str_list[i+1].strip())])
             except IndexError:
                 # mass.ErrorFile.write('error smg, no place in book notation')
                 mass.error_flag = 'error smg, no place in book notation'
                 print 'error smg, no place in book notation'
                 return
             if word == u'עשין' and len(derabanan) > 1 and (derabanan[0] != u"סימן"):
                 book = re.search(u'[א-ה]',derabanan[1])
                 # print book.group(0)
                 book = self._table[book.group(0)]
                 derabanan_flag = True
             elif re.match(reg_book, word):
                 book = self._table[word]
             else:
                 mass.ErrorFile.write("error smg, don't recognize book name")
                 print "error smg, don't recognize book name", book
                 return
         else:
             mitzva = re.split('\s', word)
             for m in mitzva:
                 # if m == u'סימן':
                 #     continue
                 if m == u'שם':
                     m = None
                 elif getGematriaVav(m):
                     m = getGematriaVav(m)
                 else:
                     m = None
                 resolved = self._tracker.resolve(book, [m])
                 resolveds.append(resolved)
     if not resolveds:
         resolved = self._tracker.resolve(book, [None])
         resolveds.append(resolved)
     # print resolveds
     return resolveds
Example 23
def create_activation(data, labels, standard_cols, group_labels=[]):

    activation = database.Activation()

    for i, col in enumerate(data):

        # Cast to integer or float if appropriate
        # if regex.match('[-\d]+$', col):
        #     col = int(col)
        # elif regex.match('[-\d\.]+$', col):
        #     col = float(col)

        # Set standard attributes if applicable and do validation where appropriate.
        # Generally, validation will not prevent a bad value from making it into the
        # activation object, but it will flag any potential issues using the "problem" column.
        if standard_cols[i] is not None:

            sc = standard_cols[i]

            # Validate XYZ columns: Should only be integers (and possible trailing decimals).
            # If they're not, keep only leading numbers. The exception is that ScienceDirect 
            # journals often follow the minus sign with a space (e.g., - 35), which we strip.
            if regex.match('[xyz]$', sc):
                m = regex.match('(-)\s+(\d+\.*\d*)$', col)
                if m:
                    col = "%s%s" % (m.group(1), m.group(2))
                if not regex.match('(-*\d+)\.*\d*$', col):
                    logging.debug("Value %s in %s column is not valid" % (col, sc))
                    activation.problems.append("Value in %s column is not valid" % sc)
                    # col = regex.search('(-*\d+)', col).group(1)
                    return activation
                col = (float(col))

            elif sc == 'region':
                if not regex.search('[a-zA-Z]', col):
                    logging.debug("Value in region column is not a string")
                    activation.problems.append("Value in region column is not a string")

            setattr(activation, sc, col)


        # Always include all columns in record
        activation.add_col(labels[i], col)
      
        # Handle columns with multiple coordinates (e.g., 45;12;-12).
        # Assume that any series of 3 numbers in a non-standard column
        # reflects coordinates. Will fail if there are leading numbers!!!
        # Also need to remove space between minus sign and numbers; some ScienceDirect
        # journals leave a gap.
        if standard_cols[i] is None:  # non-standard column
            cs = '([\-\.\s]*\d{1,3})'
            m = regex.search('%s[,;\s]+%s[,;\s]+%s' % (cs, cs, cs), unicode(col).strip())
            if m:
                x, y, z = [regex.sub('-\s+', '-', c) for c in [m.group(1), m.group(2), m.group(3)]]
                logger.info("Found multi-coordinate column: %s\n...and extracted: %s, %s, %s" % (col, x, y, z))
                activation.set_coords(x, y, z)

    activation.groups = group_labels
    return activation
Example 24
    def parse_semag(self, str, mass):
        reg_book = re.compile(u'ו?ב?(עשין|שם|לאוין|לאין)')
        split = re.split(reg_book, str.strip())
        str_list = filter(None, [item.strip() for item in split])
        resolveds = []
        derabanan_flag = False
        book = None
        reg_siman = u"סי'?|סימן"
        reg_vav = u'ו{}'.format(reg_siman)
        for i, word in enumerate(str_list):
            if derabanan_flag:
                derabanan_flag = False
                # resolved = self._tracker.resolve(book, [1])
                resolved = resolveExceptin(self._tracker, book, [1])
                resolveds.append(resolved)
                continue
            elif re.search(reg_book, word):
                try:
                    if word != u'שם':
                        derabanan = filter(None, [item.strip() for item in re.split(u'(מד"ס|מ?דרבנן)',str_list[i+1].strip())])
                except IndexError:
                    mass.write_shgia('error smg, no place in book notation')
                    return
                if word == u'עשין' and len(derabanan) > 1:
                    book = re.search(u'[א-ה]',derabanan[1])
                    book = self._table[book.group(0)]
                    derabanan_flag = True
                elif re.match(reg_book, word):
                    book = self._table[word]
                else:
                    mass.write_shgia("error smg, don't recognize book name")
                    return
            else:
                mitzva = re.split('\s', word)
                for m in mitzva:
                    if re.search(reg_vav, m) and not book:
                        # resolved = self._tracker.resolve(book, [None])
                        resolved = resolveExceptin(self._tracker, book, [None])
                        resolveds.append(resolved)

                    if m == u'ו?שם':
                        m = None
                    elif re.search(reg_siman, m):
                        continue
                    elif getGematriaVav(m, mass):
                        m = getGematriaVav(m, mass)
                    else:
                        m = None
                    # resolved = self._tracker.resolve(book, [m])
                    resolved = resolveExceptin(self._tracker, book, [m])
                    resolveds.append(resolved)
        if not resolveds:
            # resolved = self._tracker.resolve(book, [None])
            resolved = resolveExceptin(self._tracker, book, [None])

            resolveds.append(resolved)
        if len([item for item in resolveds if not isinstance(item, Ref)]) > 0:
            mass.write_shgia(u'error from ibid in Ref or table none problem')
        return resolveds
Example 25
 def _sub_symbols(self, line):
     while re.search(r'::\w+::', line):
         s = re.search(r'::\w+::', line).group(0)
         if s in self.symbols:
             line = line.replace(s, self.symbols[s])
         else:
             raise RuleFileError('Undefined symbol: {}'.format(s))
     return line
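An illustrative harness for _sub_symbols. The method only reads self.symbols, so a SimpleNamespace stands in for the real rule-file object; the ::vowel:: symbol is an invented example.

import re
from types import SimpleNamespace

fake_self = SimpleNamespace(symbols={"::vowel::": "[aeiou]"})
print(_sub_symbols(fake_self, "::vowel::+"))  # [aeiou]+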
Example 26
def is_blacklisted(part, group_name, blacklists):
    for blacklist in blacklists:
        if regex.search(blacklist.group_name, group_name):
            # too spammy
            # log.debug('{0}: Checking blacklist {1}...'.format(group_name, blacklist['regex']))
            if regex.search(blacklist.regex, part[blacklist.field]):
                return True
    return False
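A small sketch of is_blacklisted in action. Blacklist is a stand-in namedtuple; the real objects only need .group_name, .regex and .field attributes, and part only needs to be indexable by that field. The values below are invented.

import regex
from collections import namedtuple

Blacklist = namedtuple("Blacklist", "group_name regex field")
blacklists = [Blacklist(group_name=r"alt\.binaries\..*", regex=r"(?i)password", field="subject")]
part = {"subject": "Re: PASSWORD protected archive"}
print(is_blacklisted(part, "alt.binaries.test", blacklists))  # True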
Example 27
 def use_type_get_index(self, st):
     address_regex = CitationFinder.get_ultimate_title_regex(title=u"שם", node=None, lang='he')
     # address_regex = CitationFinder.create_or_address_regexes(lang='he')
     m = re.search(address_regex, st)
     for k, v in m.groupdict().items():
         if v and not re.search("Title|a0|a1", k):
             address_type = k
     return address_type
def seferHamitzvot_from_rasag_comm(rasagCsvName, with_orig = False):
        # ind_rasag_comm = library.get_index("Commentary on Sefer Hamitzvot of Rasag")
        segments = Ref('Commentary_on_Sefer_Hamitzvot_of_Rasag,_Positive_Commandments').all_segment_refs()
        segments.extend(Ref('Commentary_on_Sefer_Hamitzvot_of_Rasag,_Negative_Commandments').all_segment_refs())
        segments.extend(Ref('Commentary_on_Sefer_Hamitzvot_of_Rasag,_Laws_of_the_Courts').all_segment_refs())
        segments.extend(Ref('Commentary_on_Sefer_Hamitzvot_of_Rasag,_Communal_Laws').all_segment_refs())

        cnt = {"Rasag":0, "Sefer HaMitzvot":0, "Semag":0, "Semak":0}
        dict_list = []
        for seg in segments:
            # sfHmtzvot = re.search(u'(?:ספר המצו?ות|סה"מ).{1,4}(עשין|לאוין|עשה|לא תעשה).{0,20}', seg.text('he').text)
            sfHmtzvot = re.search(u'(?:ספר המצוות|סה"מ)\s{1,4}\((.*?)\)', seg.text('he').text)
            smg = re.search(u'סמ"ג \((.*?)\)', seg.text('he').text)
            smk = re.search(u'סמ"ק (\(.*?\))', seg.text('he').text)
            row_dict = {}
            row_orig = {}
            if sfHmtzvot:
                # row_dict["Rasag"] = re.search("(Sefer.*?\d*?):", seg.normal()).group(1)
                # row_orig["Rasag"] = re.search("(Sefer.*?\d*?):", seg.normal()).group(1)
                kind, simanim = rasag_exctractor(sfHmtzvot.group(1))
                # row_dict["Sefer HaMitzvot"] = ['Sefer HaMitzvot, {}.{}'.format(kind, siman) for siman in simanim]
                if kind:
                    row_dict["Sefer HaMitzvot"] = 'Sefer HaMitzvot, {}.{}'.format(kind, simanim[0])
                else:
                    print "no kind", sfHmtzvot.group(1)
                row_orig["Sefer HaMitzvot"] = sfHmtzvot.group()
                cnt["Sefer HaMitzvot"] += 1
            if smg:
                # row_dict["Rasag"] = re.search("(Sefer.*?\d*?):", seg.normal()).group(1)
                kind, simanim = rasag_exctractor(smg.group(1))
                # row_dict["Semag"] = ['Sefer Mitzvot Gadol, {}.{}'.format(kind, siman) for siman in simanim]
                if kind:
                    row_dict["Semag"] = 'Sefer Mitzvot Gadol, {}.{}'.format(kind, simanim[0])
                else:
                    print "no kind", smg.group(1)
                row_orig["Semag"] = smg.group()
                cnt["Semag"] += 1
            if smk:
                # row_dict["Rasag"] = re.search("(Sefer.*?\d*?):", seg.normal()).group(1)
                # simanim = siman_smk_exctractor(smk.group(1))
                smki = re.search(u"ב?סי'\s+(.*?)(?:\s*\))", smk.group(1))
                if smki:
                    siman = getGematria(smki.group(1))
                    row_dict["Semak"] = "Sefer Mitzvot Katan.{}".format(siman)
                    row_orig["Semak"] = smk.group()
                    cnt["Semak"] += 1
                else:
                    print u'***siman***' + smk.group()

            if row_dict:
                cnt["Rasag"] += 1
                row_dict["Rasag"] = re.search("(Sefer.*?\d*?):", seg.normal()).group(1)
                row_orig["Rasag"] = seg.normal()
                if with_orig:
                    dict_list.append(row_orig)
                dict_list.append(row_dict)
        toCsv(rasagCsvName, ["Rasag", "Sefer HaMitzvot", "Semag", "Semak"], dict_list)
        print cnt
Example 29
	def evaluate(self,scope, locals, block=None):
		#self.data = re.sub('\A[\n\r\s]+','',self.data)
		blank_pattern = re.compile(r"""\A\s*\Z""",re.M)
		end_pattern = re.compile(r""";\s*\Z""",re.M)

		if re.search(blank_pattern,self.data) or re.search(end_pattern,self.data):
			return self.data #return "%s\n" % self.data if self.data != "" and not self.data.endswith('\n') else self.data			
		else:
			return "%s;\n" % self.data
def isYelling(message):
    isYelling = False
    if (re.search(u'[A-Z]', message)):
        if not (re.search(u'[a-z]', message)):
            # \p{Ll} (any Unicode lowercase letter) needs the third-party regex module; the stdlib re does not support it
            if (regex.search(u'\p{Ll}', message)):
                isYelling = False
            else:
                isYelling = True
    return isYelling
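A couple of sanity checks for isYelling; the messages are arbitrary.

print(isYelling(u"STOP RIGHT THERE"))  # True: uppercase only
print(isYelling(u"Stop right there"))  # False: lowercase present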
Example 31
    def parse_implicit_date(self, source: str,
                            reference: datetime) -> DateTimeParseResult:
        trimmed_source = source.strip()
        result = DateTimeResolutionResult()

        # handle "on 12"
        match = regex.search(self.config.on_regex,
                             self.config.date_token_prefix + trimmed_source)
        if match and match.start() == len(
                self.config.date_token_prefix) and len(
                    match.group()) == len(trimmed_source):
            day = 0
            month = reference.month
            year = reference.year
            day_str = match.group('day')
            day = self.config.day_of_month.get(day_str)

            result.timex = DateTimeFormatUtil.luis_date(-1, -1, day)

            try_str = DateTimeFormatUtil.luis_date(year, month, day)
            try_date = datetime.strptime(try_str, '%Y-%m-%d')
            future_date: datetime
            past_date: datetime

            if try_date:
                future_date = DateUtils.safe_create_from_min_value(
                    year, month, day)
                past_date = DateUtils.safe_create_from_min_value(
                    year, month, day)

                if future_date < reference:
                    future_date += datedelta(months=1)

                if past_date >= reference:
                    past_date += datedelta(months=-1)
            else:
                future_date = DateUtils.safe_create_from_min_value(
                    year, month + 1, day)
                past_date = DateUtils.safe_create_from_min_value(
                    year, month - 1, day)

            result.future_value = future_date
            result.past_value = past_date
            result.success = True
            return result

        # handle "today", "the day before yesterday"
        match = regex.match(self.config.special_day_regex, trimmed_source)
        if match and match.start() == 0 and len(
                match.group()) == len(trimmed_source):
            swift = self.config.get_swift_day(match.group())
            value = reference + timedelta(days=swift)
            result.timex = DateTimeFormatUtil.luis_date_from_datetime(value)
            result.future_value = value
            result.past_value = value
            result.success = True
            return result

        # handle "next Sunday"
        match = regex.match(self.config.next_regex, trimmed_source)
        if match and match.start() == 0 and len(
                match.group()) == len(trimmed_source):
            weekday_str = match.group('weekday')
            value = DateUtils.next(reference,
                                   self.config.day_of_week.get(weekday_str))

            result.timex = DateTimeFormatUtil.luis_date_from_datetime(value)
            result.future_value = value
            result.past_value = value
            result.success = True
            return result

        # handle "this Friday"
        match = regex.match(self.config.this_regex, trimmed_source)
        if match and match.start() == 0 and len(
                match.group()) == len(trimmed_source):
            weekday_str = match.group('weekday')
            value = DateUtils.this(reference,
                                   self.config.day_of_week.get(weekday_str))

            result.timex = DateTimeFormatUtil.luis_date_from_datetime(value)
            result.future_value = value
            result.past_value = value
            result.success = True
            return result

        # handle "last Friday", "last mon"
        match = regex.match(self.config.last_regex, trimmed_source)
        if match and match.start() == 0 and len(
                match.group()) == len(trimmed_source):
            weekday_str = match.group('weekday')
            value = DateUtils.last(reference,
                                   self.config.day_of_week.get(weekday_str))

            result.timex = DateTimeFormatUtil.luis_date_from_datetime(value)
            result.future_value = value
            result.past_value = value
            result.success = True
            return result

        # handle "Friday"
        match = regex.match(self.config.week_day_regex, trimmed_source)
        if match and match.start() == 0 and len(
                match.group()) == len(trimmed_source):
            weekday_str = match.group('weekday')
            weekday = self.config.day_of_week.get(weekday_str)
            value = DateUtils.this(reference, weekday)

            if weekday < int(DayOfWeek.Monday):
                weekday = int(DayOfWeek.Sunday)

            if weekday < reference.isoweekday():
                value = DateUtils.next(reference, weekday)

            result.timex = 'XXXX-WXX-' + str(weekday)
            future_date = value
            past_date = value

            if future_date < reference:
                future_date += timedelta(weeks=1)

            if past_date >= reference:
                past_date -= timedelta(weeks=1)

            result.future_value = future_date
            result.past_value = past_date
            result.success = True
            return result

        # handle "for the 27th."
        match = regex.match(self.config.for_the_regex, trimmed_source)
        if match:
            day_str = match.group('DayOfMonth')
            er = ExtractResult.get_from_text(day_str)
            day = int(self.config.number_parser.parse(er).value)

            month = reference.month
            year = reference.year

            result.timex = DateTimeFormatUtil.luis_date(-1, -1, day)
            date = datetime(year, month, day)
            result.future_value = date
            result.past_value = date
            result.success = True

            return result

        # handle cases like 'Thursday the 21st', where both 'Thursday' and '21st' refer to the same date
        match = regex.match(self.config.week_day_and_day_of_month_regex,
                            trimmed_source)
        if match:
            day_str = match.group('DayOfMonth')
            er = ExtractResult.get_from_text(day_str)
            day = int(self.config.number_parser.parse(er).value)
            month = reference.month
            year = reference.year

            # the validity of the phrase is guaranteed in the Date Extractor
            result.timex = DateTimeFormatUtil.luis_date(year, month, day)
            date = datetime(year, month, day)
            result.future_value = date
            result.past_value = date
            result.success = True

            return result

        return result
 def OutputStudy(self, out_file, num_sections, has_next, main_words, extra_word_lists,
                 body_dbm, uniq_words, out_children):
   def P(*args, end="\n"):
     esc_args = []
     for arg in args[1:]:
       if isinstance(arg, str):
         arg = esc(arg)
       esc_args.append(arg)
     print(args[0].format(*esc_args), end=end, file=out_file)
   def PrintNavi():
     P('<div class="navi">')
     P('<a href="index.xhtml">TOP</a>')
     check_url = "check-{:03d}.xhtml".format(num_sections)
     P('<a href="{}">CHECK</a>', check_url)
     if num_sections == 1:
       P('<span class="void">PREV</span>')
     else:
       prev_url = "study-{:03d}.xhtml".format(num_sections - 1)
       P('<a href="{}">PREV</a>', prev_url)
     if has_next:
       next_url = "study-{:03d}.xhtml".format(num_sections + 1)
       P('<a href="{}">NEXT</a>', next_url)
     else:
       P('<span class="void">NEXT</span>')
     P('</div>')
   P('<?xml version="1.0" encoding="UTF-8"?>')
   P('<!DOCTYPE html>')
   P('<html xmlns="http://www.w3.org/1999/xhtml">')
   P('<head>')
   P('<meta charset="UTF-8"/>')
   P('<meta name="viewport" content="width=device-width"/>')
   P('<title>{}: Chapter {} Study</title>', self.title, num_sections)
   P('<link rel="stylesheet" href="style.css"/>')
   P('</head>')
   P('<body>')
   P('<article>')
   PrintNavi()
   P('<h1><a href="">Chapter {} Study</a></h1>', num_sections)
   num_words = 0
   for surface, aliases in main_words:
     entry = None
     data = body_dbm.GetStr(surface)
     if data:
       entries = json.loads(data)
       for word_entry in entries:
         if word_entry["word"] == surface:
           entry = word_entry
           break
     if not entry:
       P('<p>Warning: no data for {}</p>', surface)
       continue
     uniq_words[surface] = num_sections
     num_words += 1
     word_id = ConvertWordToID(surface)
     P('<section id="{}" class="entry">', word_id)
     P('<div class="num">{:02d}</div>', num_words)
     P('<div class="head">')
     P('<a href="#{}" class="word">{}</a>', word_id, surface)
     pron = entry.get("pronunciation")
     if pron:
       P('<span class="pron">{}</span>', pron)
     P('</div>', surface)
     trans = entry.get("translation")
     if trans:
       P('<div class="trans">{}</div>', ", ".join(trans[:8]))
     first_label = None
     num_items = 0
     poses = set()
     for item in entry["item"]:
       label = item["label"]
       pos = item["pos"]
       text = item["text"]
       if regex.search(r"^\[translation\]", text): continue
       if num_items >= 10: break
       if first_label and label != first_label:
         break
       first_label = label
       parts = []
       for part in text.split("[-]"):
         part = part.strip()
         parts.append(part)
       if not parts: continue
       num_items += 1
       main_text = CutTextByWidth(parts[0], 128)
       poses.add(pos)
       P('<div class="text">')
       P('<span class="attr">{}</span>', POSES.get(pos) or pos)
       P('<span>{}</span>', main_text)
       P('</div>')
       synonyms = []
       examples = []
       for part in parts[1:]:
         match = regex.search(r"\[synonym\]: (.*)", part)
         if match:
           synonyms.append(match.group(1).strip())
         match = regex.search(r"\e.g.: (.*)", part)
         if match:
           examples.append(match.group(1).strip())
       for text in synonyms:
         text = CutTextByWidth(text, 128)
         P('<div class="aux">')
         P('<span class="auxattr">≒</span>')
         P('<span>{}</span>', text)
         P('</div>')
       for text in examples[:1]:
         text = CutTextByWidth(text, 128)
         P('<div class="aux">')
         P('<span class="auxattr">・</span>')
         P('<span>{}</span>', text)
         P('</div>')
     parents = entry.get("parent")
     children = entry.get("child")
     sibling_alts = set((parents or []) + (children or []))
     if children:
       for child in list(children):
         child_data = body_dbm.GetStr(child)
         if not child_data: continue
         child_entries = json.loads(child_data)
         for child_entry in child_entries:
           if child_entry["word"] != child: continue
           grand_children = child_entry.get("child")
           if grand_children:
             for grand_child in grand_children:
               if grand_child not in children:
                 children.append(grand_child)
     phrases = entry.get("phrase") or []
     for label, derivatives in (("語幹", parents), ("派生", children)):
       if not derivatives: continue
       for child in derivatives:
         if not regex.search(r"^[a-zA-Z]", child): continue
         if child in uniq_words: continue
         uniq_words[child] = num_sections
         child_trans = None
         child_poses = None
         child_data = body_dbm.GetStr(child)
         if child_data:
           child_entries = json.loads(child_data)
           child_prob = 0
           for child_entry in child_entries:
             if child_entry["word"] != child: continue
             child_prob = float(child_entry.get("probability") or 0.0)
             us_hit = IsGbAlternative(child_entry, sibling_alts, body_dbm)
             if us_hit:
               continue
             child_trans = child_entry.get("translation")
             child_poses = self.GetEntryPOSList(child_entry)
             child_phrases = child_entry.get("phrase") or []
             if child_phrases:
               phrases.extend(child_phrases)
             break
         if not child_trans: continue
         if self.child_min_prob > 0 and child_prob < self.child_min_prob:
           continue
         P('<div class="child">')
         P('<span class="attr">{}</span>', label)
         for child_pos in child_poses[:2]:
           P('<span class="attr subattr">{}</span>', POSES.get(child_pos) or child_pos)
         child_id = ConvertWordToID(child)
         P('<span id="{}" class="subword">{}</span>', child_id, child_id, child)
         P('<span class="childtrans">: {}</span>', ", ".join(child_trans[:4]))
         P('</div>')
         out_children[surface].append((child, child_trans))
     if phrases:
       for phrase in phrases:
         if not phrase.get("i"): continue
         phrase_word = phrase.get("w")
         if not regex.search(r"^[a-zA-Z]", phrase_word): continue
         if phrase_word in uniq_words: continue
         uniq_words[phrase_word] = num_sections
         phrase_data = body_dbm.GetStr(phrase_word)
         if not phrase_data: continue
         phrase_entries = json.loads(phrase_data)
         phrase_trans = None
         phrase_poses = None
         phrase_prob = 0
         for phrase_entry in phrase_entries:
           if phrase_entry["word"] != phrase_word: continue
           phrase_prob = float(phrase_entry.get("probability") or 0.0)
           phrase_trans = phrase_entry.get("translation")
           phrase_poses = self.GetEntryPOSList(phrase_entry)
           break
         if not phrase_trans: continue
         if self.child_min_prob > 0 and phrase_prob < self.child_min_prob: continue
         P('<div class="child">')
         P('<span class="attr">句</span>')
         for phrase_pos in phrase_poses[:2]:
           P('<span class="attr subattr">{}</span>', POSES.get(phrase_pos) or phrase_pos)
         phrase_id = ConvertWordToID(phrase_word)
         P('<span href="#{}" id="{}" class="subword">{}</span>', phrase_id, phrase_id, phrase_word)
         P('<span class="childtrans">: {}</span>', ", ".join(phrase_trans[:4]))
         P('</div>')
     infls = []
     for name, label in INFLECTIONS:
       prefix = name[:name.find("_")]
       if prefix not in poses: continue
       value = entry.get(name)
       if not value: continue
       infls.append((label, value))
     uniq_infls = set()
     for alias in aliases:
       if alias in uniq_infls: continue
       uniq_infls.add(alias)
       infls.append(("代替", alias))
     alternatives = entry.get("alternative")
     if alternatives:
       for alt in alternatives:
         if alt in uniq_infls: continue
         uniq_infls.add(alt)
         infls.append(("代替", alt))
     if infls:
       P('<div class="meta">')
       for label, value in infls:
         P('<span class="attr">{}</span>', label)
         P('<span class="metavalue">{}</span>', value)
       P('</div>')
     P('</section>')
   extra_words = []
   for extra_word_list in extra_word_lists:
     num_extra_words = 0
     for extra_word in extra_word_list:
       if num_extra_words >= self.num_extra_items: break
       if extra_word in uniq_words: continue
       extra_trans = []
       extra_poses = []
       extra_data = body_dbm.GetStr(extra_word)
       if extra_data:
         extra_entries = json.loads(extra_data)
         for extra_entry in extra_entries:
           if extra_entry["word"] != extra_word: continue
           extra_trans.extend(extra_entry.get("translation") or [])
           extra_poses.extend(self.GetEntryPOSList(extra_entry))
       if not extra_trans: continue
       extra_trans = extra_trans[:5]
       has_good_tran = False
       for extra_tran in extra_trans:
         if regex.search(r"[\p{Han}\p{Hiragana}\p{Katakana}]", extra_tran):
           has_good_tran = True
       if not has_good_tran: continue
       extra_words.append((extra_word, extra_trans, extra_poses))
       uniq_words[extra_word] = num_sections
       num_extra_words += 1
   if extra_words:
     P('<section class="entry">')
     P('<div class="num">Bonus Words</div>')
     for extra_word, extra_trans, extra_poses in extra_words:
       P('<div class="extra">')
       word_id = ConvertWordToID(extra_word)
       P('<span id="{}" class="subword">{}</span> :', word_id, word_id, extra_word)
       for extra_pos in extra_poses[:2]:
         P('<span class="attr subattr">{}</span>', POSES.get(extra_pos) or extra_pos)
       P('<span class="childtrans">{}</span>', ", ".join(extra_trans))
       P('</div>')
     P('</section>')
   PrintNavi()
   P('</article>')
   P('</body>')
   P('</html>')
def regex_find(string, pattern):
    match = regex.search(pattern, string, flags=re.IGNORECASE)
    return match is not None
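A trivial usage example for regex_find; like the function itself, it relies on both the regex package and the stdlib re (for the IGNORECASE flag).

import re
import regex

print(regex_find("Hello World", r"world"))  # True, thanks to the case-insensitive flag
print(regex_find("Hello World", r"moon"))   # False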
Example 34
def is_hebrew(s):
    if regex.search(u"\p{Hebrew}", s):
        return True
    return False
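A quick check of is_hebrew using the regex package's Unicode script property; the sample strings are arbitrary.

import regex

print(is_hebrew(u"שלום"))   # True
print(is_hebrew(u"hello"))  # False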
Example 35
        try:
            title = driver.find_element_by_xpath(f'//*[@id="body"]/div/div[2]/ul/li[{i}]/div/a/div[2]').text
            print(title)
        except:
          
            num+=1
            i=1
            continue
       
        for word in wordlist:
            if word in title:
                print("ERROR")
                ERROR = True
                break
            # check number
        if (re.search(r'\d', title)):
            print("NUMBER ERROR")
            ERROR = True
            
        # typ = driver.find_element_by_xpath(f'//*[@id="main-content"]/div/main/article[{i}]/div/div/p/a').text
        
        # if 'DINNER' not in typ:
        #     print('SIDE ERROR')
        #     ERROR = True
            
        if ERROR:
            i+=1
            continue
        
        
Example 36
    def parse_implicit_date(self, source: str,
                            reference: datetime) -> DateTimeParseResult:
        trimmed_source = source.strip()
        result = DateTimeResolutionResult()

        # handle "on 12"
        match = regex.search(self.special_date_regex, trimmed_source)
        if match and len(match.group()) == len(trimmed_source):
            day = 0
            month = reference.month
            year = reference.year
            year_str = RegExpUtility.get_group(match, 'thisyear')
            month_str = RegExpUtility.get_group(match, 'thismonth')
            day_str = RegExpUtility.get_group(match, 'day')
            day = self.config.day_of_month.get(day_str, -1)

            has_year = year_str.strip() != ''
            has_month = month_str.strip() != ''

            if has_month:
                if regex.search(self.token_next_regex, month_str):
                    month += 1
                    if month == Constants.MaxMonth + 1:
                        month = Constants.MinMonth
                        year += 1
                elif regex.search(self.token_last_regex, month_str):
                    month -= 1
                    if month == Constants.MinMonth - 1:
                        month = Constants.MaxMonth
                        year -= 1

                if has_year:
                    if regex.search(self.token_next_regex, year_str):
                        year += 1
                    elif regex.search(self.token_last_regex, year_str):
                        year -= 1

            result.timex = DateTimeFormatUtil.luis_date(
                year if has_year else -1, month if has_month else -1, day)

            future_date: datetime
            past_date: datetime

            if day > self.get_month_max_day(year, month):
                futureMonth = month + 1
                pastMonth = month - 1
                futureYear = year
                pastYear = year

                if futureMonth == Constants.MaxMonth + 1:
                    futureMonth = Constants.MinMonth
                    futureYear = year + 1

                if pastMonth == Constants.MinMonth - 1:
                    pastMonth = Constants.MaxMonth
                    pastYear = year - 1

                isFutureValid = DateUtils.is_valid_date(
                    futureYear, futureMonth, day)
                isPastValid = DateUtils.is_valid_date(pastYear, pastMonth, day)

                if isFutureValid and isPastValid:
                    future_date = DateUtils.safe_create_from_min_value(
                        futureYear, futureMonth, day)
                    past_date = DateUtils.safe_create_from_min_value(
                        pastYear, pastMonth, day)
                elif isFutureValid and not isPastValid:
                    future_date = past_date = DateUtils.safe_create_from_min_value(
                        futureYear, futureMonth, day)
                elif not isFutureValid and not isPastValid:
                    future_date = past_date = DateUtils.safe_create_from_min_value(
                        pastYear, pastMonth, day)
                else:
                    future_date = past_date = DateUtils.safe_create_from_min_value(
                        year, month, day)
            else:
                future_date = DateUtils.safe_create_from_min_value(
                    year, month, day)
                past_date = DateUtils.safe_create_from_min_value(
                    year, month, day)

                if not has_month:
                    if future_date < reference:
                        if self.is_valid_date(year, month + 1, day):
                            future_date += datedelta(months=1)
                    if past_date >= reference:
                        if self.is_valid_date(year, month - 1, day):
                            past_date += datedelta(months=-1)
                        elif self.is_non_leap_year_Feb_29th(
                                year, month - 1, day):
                            past_date += datedelta(months=-2)
                elif has_month and not has_year:
                    if future_date < reference:
                        if self.is_valid_date(year + 1, month, day):
                            future_date += datedelta(years=1)
                    if past_date >= reference:
                        if self.is_valid_date(year - 1, month, day):
                            past_date += datedelta(years=-1)

            result.future_value = future_date
            result.past_value = past_date
            result.success = True
            return result

        # handle "today", "the day before yesterday"
        match = regex.match(self.config.special_day_regex, trimmed_source)
        if match and match.start() == 0 and len(
                match.group()) == len(trimmed_source):
            swift = self.config.get_swift_day(match.group())
            value = reference + timedelta(days=swift)

            result.timex = DateTimeFormatUtil.luis_date_from_datetime(value)
            result.future_value = value
            result.past_value = value
            result.success = True
            return result

        # handle "this Friday"
        match = regex.match(self.config.this_regex, trimmed_source)
        if match and match.start() == 0 and len(
                match.group()) == len(trimmed_source):
            weekday_str = RegExpUtility.get_group(match, 'weekday')
            value = DateUtils.this(reference,
                                   self.config.day_of_week.get(weekday_str))

            result.timex = DateTimeFormatUtil.luis_date_from_datetime(value)
            result.future_value = value
            result.past_value = value
            result.success = True
            return result

        # handle "next Sunday"
        match = regex.match(self.config.next_regex, trimmed_source)
        if match and match.start() == 0 and len(
                match.group()) == len(trimmed_source):
            weekday_str = RegExpUtility.get_group(match, 'weekday')
            value = DateUtils.next(reference,
                                   self.config.day_of_week.get(weekday_str))

            result.timex = DateTimeFormatUtil.luis_date_from_datetime(value)
            result.future_value = value
            result.past_value = value
            result.success = True
            return result

        # handle "last Friday", "last mon"
        match = regex.match(self.config.last_regex, trimmed_source)
        if match and match.start() == 0 and len(
                match.group()) == len(trimmed_source):
            weekday_str = RegExpUtility.get_group(match, 'weekday')
            value = DateUtils.last(reference,
                                   self.config.day_of_week.get(weekday_str))

            result.timex = DateTimeFormatUtil.luis_date_from_datetime(value)
            result.future_value = value
            result.past_value = value
            result.success = True
            return result

        # handle "Friday"
        match = regex.match(self.config.week_day_regex, trimmed_source)
        if match and match.start() == 0 and len(
                match.group()) == len(trimmed_source):
            weekday_str = RegExpUtility.get_group(match, 'weekday')
            weekday = self.config.day_of_week.get(weekday_str)
            value = DateUtils.this(reference, weekday)

            if weekday == 0:
                weekday = 7

            if weekday < reference.isoweekday():
                value = DateUtils.next(reference, weekday)

            result.timex = 'XXXX-WXX-' + str(weekday)
            future_date = value
            past_date = value

            if future_date < reference:
                future_date += timedelta(weeks=1)

            if past_date >= reference:
                past_date -= timedelta(weeks=1)

            result.future_value = future_date
            result.past_value = past_date
            result.success = True
            return result

        return result
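# A standalone sketch of the plain-weekday branch above ("handle 'Friday'"),
# using only datetime (the reference date below is made up): take this week's
# occurrence of the weekday, push the future value one week ahead if it has
# already passed, and pull the past value one week back if it has not.
from datetime import datetime, timedelta

reference = datetime(2024, 5, 15)            # a Wednesday
weekday = 5                                  # Friday in ISO numbering
value = reference + timedelta(days=weekday - reference.isoweekday())

future_date, past_date = value, value
if future_date < reference:
    future_date += timedelta(weeks=1)
if past_date >= reference:
    past_date -= timedelta(weeks=1)

print(future_date.date(), past_date.date())  # 2024-05-17 2024-05-10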
Ejemplo n.º 37
0
 def should_skip_from_merge(self, source: ExtractResult) -> bool:
     return regex.search(self.config.from_to_regex, source.text)
Ejemplo n.º 38
0
def alignSequences(targetsite_sequence, window_sequence, max_score=7):

    window_sequence = window_sequence.upper()
    query_regex_standard, query_regex_gap = regexFromSequence(
        targetsite_sequence, errors=max_score)

    # Try both strands
    alignments_mm, alignments_bulge = list(), list()
    alignments_mm.append(('+', 'standard',
                          regex.search(query_regex_standard, window_sequence,
                                       regex.BESTMATCH)))
    alignments_mm.append(('-', 'standard',
                          regex.search(query_regex_standard,
                                       reverseComplement(window_sequence),
                                       regex.BESTMATCH)))
    alignments_bulge.append(('+', 'gapped',
                             regex.search(query_regex_gap, window_sequence,
                                          regex.BESTMATCH)))
    alignments_bulge.append(('-', 'gapped',
                             regex.search(query_regex_gap,
                                          reverseComplement(window_sequence),
                                          regex.BESTMATCH)))

    lowest_distance_score, lowest_mismatch = 100, max_score + 1
    chosen_alignment_b, chosen_alignment_m, chosen_alignment_strand_b, chosen_alignment_strand_m = None, None, '', ''

    # Use regex to find the best match allowing only for mismatches
    for aln_m in alignments_mm:
        strand_m, alignment_type_m, match_m = aln_m
        if match_m != None:
            mismatches, insertions, deletions = match_m.fuzzy_counts
            if mismatches < lowest_mismatch:
                chosen_alignment_m = match_m
                chosen_alignment_strand_m = strand_m
                lowest_mismatch = mismatches

    # Use regex to find the best match allowing for gaps, so that its edit distance is strictly lower than the
    # total number of mismatches of the sequence founded (if any) allowing only for mismatches.
    for aln_b in alignments_bulge:
        strand_b, alignment_type_b, match_b = aln_b
        if match_b != None:
            substitutions, insertions, deletions = match_b.fuzzy_counts
            if insertions or deletions:
                distance_score = substitutions + (insertions + deletions) * 3
                edistance = substitutions + insertions + deletions
                if distance_score < lowest_distance_score and edistance < lowest_mismatch:
                    chosen_alignment_b = match_b
                    chosen_alignment_strand_b = strand_b
                    lowest_distance_score = distance_score

    if chosen_alignment_m:
        offtarget_sequence_no_bulge = chosen_alignment_m.group()
        mismatches = chosen_alignment_m.fuzzy_counts[0]
        start_no_bulge = chosen_alignment_m.start()
        end_no_bulge = chosen_alignment_m.end()
    else:
        offtarget_sequence_no_bulge, mismatches, start_no_bulge, end_no_bulge, chosen_alignment_strand_m = '', '', '', '', ''
    bulged_offtarget_sequence, score, length, substitutions, insertions, deletions, bulged_start, bulged_end, realigned_target = \
        '', '', '', '', '', '', '', '', 'none'
    if chosen_alignment_b:
        realigned_target, bulged_offtarget_sequence = realignedSequences(
            targetsite_sequence, chosen_alignment_b, max_score)
        if bulged_offtarget_sequence:
            length = len(chosen_alignment_b.group())
            substitutions, insertions, deletions = chosen_alignment_b.fuzzy_counts
            score = substitutions + (insertions + deletions) * 3
            bulged_start = chosen_alignment_b.start()
            bulged_end = chosen_alignment_b.end()
        else:
            chosen_alignment_strand_b = ''

    return [
        offtarget_sequence_no_bulge, mismatches,
        len(offtarget_sequence_no_bulge), chosen_alignment_strand_m,
        start_no_bulge, end_no_bulge, realigned_target,
        bulged_offtarget_sequence, length, score, substitutions, insertions,
        deletions, chosen_alignment_strand_b, bulged_start, bulged_end
    ]
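# A minimal, self-contained sketch of the fuzzy-matching API that alignSequences()
# relies on (the target and window sequences below are made up): the third-party
# regex module accepts "(SEQ){e<=N}" to tolerate up to N errors, regex.BESTMATCH
# picks the alignment with the fewest errors, and match.fuzzy_counts reports
# (substitutions, insertions, deletions).
import regex

target = "ACGTTTCGT"
window = "ACGTACGTTACGTTCGTAC"
pattern = "(" + target + "){e<=2}"           # allow up to 2 errors of any kind

m = regex.search(pattern, window, regex.BESTMATCH)
if m:
    subs, ins, dels = m.fuzzy_counts
    score = subs + (ins + dels) * 3          # same weighting as alignSequences()
    print(m.group(), m.span(), m.fuzzy_counts, score)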
Ejemplo n.º 39
0
def find_date_separator(format):
    m = re.search(r'(?:(?:%[dbBmaA])(\W))+', format)
    if m:
        return m.group(1)
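# A quick illustration of find_date_separator() (the format strings below are
# arbitrary examples): it returns the separator that follows a date directive,
# or None when no such directive is present.
import re

print(find_date_separator("%m/%d/%Y"))   # "/"
print(find_date_separator("%d-%b-%Y"))   # "-"
print(find_date_separator("%H:%M"))      # None (no %d/%b/%B/%m/%a/%A directive)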
Ejemplo n.º 40
0
 def is_group_price(self, string):
     string = string.strip()
     rex = r'\dF\d\d?\d?'
     if regex.search(rex, string):
         return True
     return False
Ejemplo n.º 41
0
 def Run(self):
     start_time = time.time()
     logger.info("Process started: input_path={}, output_path={}".format(
         self.input_path, self.output_path))
     mem_index = tkrzw.DBM()
     mem_index.Open("", True, dbm="BabyDBM").OrDie()
     input_dbm = tkrzw.DBM()
     input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
     it = input_dbm.MakeIterator()
     it.First()
     num_entries = 0
     num_translations = 0
     while True:
         record = it.GetStr()
         if not record: break
         key, serialized = record
         entry = json.loads(serialized)
         for word_entry in entry:
             prob = max(float(word_entry.get("probability") or "0"),
                        0.0000001)
             aoa = min(float(word_entry.get("aoa") or "20"), 20.0)
             score = prob * ((30 - aoa) / 10)
             word_trans = word_entry.get("translation") or []
             dup_word_trans = word_trans
             for word_tran in word_trans:
                 match = regex.search(
                     r"([\p{Han}\p{Katakana}ー]{2,})(する|すること|される|されること|をする)$",
                     word_tran)
                 if match:
                     short_word_tran = word_tran[:-len(match.group(2))]
                     if short_word_tran:
                         dup_word_trans.append(short_word_tran)
                 short_word_tran = self.tokenizer.CutJaWordNounParticle(
                     word_tran)
                 if short_word_tran != word_tran:
                     dup_word_trans.append(short_word_tran)
                 match = regex.search(
                     r"([\p{Han}\p{Katakana}ー]{2,})(的|的な|的に)$", word_tran)
                 if match:
                     short_word_tran = word_tran[:-len(match.group(2))]
                     if short_word_tran:
                         dup_word_trans.append(short_word_tran)
                 match = regex.search(
                     r"([\p{Han}]{2,})(が|の|を|に|へ|と|より|から|で|や|な|なる|たる)$",
                     word_tran)
                 if match:
                     short_word_tran = word_tran[:-len(match.group(2))]
                     if short_word_tran:
                         dup_word_trans.append(short_word_tran)
             uniq_trans = set()
             for tran in dup_word_trans:
                 norm_tran = tkrzw_dict.NormalizeWord(tran)
                 if norm_tran in uniq_trans: continue
                 uniq_trans.add(norm_tran)
                 pair = "{}\t{:.8f}".format(key, score)
                 score *= 0.98
                 mem_index.Append(norm_tran, pair, "\t").OrDie()
             num_translations += len(uniq_trans)
         num_entries += 1
         if num_entries % 10000 == 0:
             logger.info("Reading: entries={}, translations={}".format(
                 num_entries, num_translations))
         it.Next()
     input_dbm.Close().OrDie()
     logger.info("Reading done: entries={}, translations={}".format(
         num_entries, num_translations))
     output_dbm = tkrzw.DBM()
     num_buckets = mem_index.Count() * 2
     output_dbm.Open(self.output_path,
                     True,
                     dbm="HashDBM",
                     truncate=True,
                     align_pow=0,
                     num_buckets=num_buckets).OrDie()
     tran_prob_dbm = None
     if self.tran_prob_path:
         tran_prob_dbm = tkrzw.DBM()
         tran_prob_dbm.Open(self.tran_prob_path, False,
                            dbm="HashDBM").OrDie()
     it = mem_index.MakeIterator()
     it.First()
     num_records = 0
     while True:
         record = it.GetStr()
         if not record: break
         key, value = record
         scored_trans = []
         uniq_words = set()
         fields = value.split("\t")
         for i in range(0, len(fields), 2):
             word = fields[i]
             score = float(fields[i + 1])
             if word in uniq_words: continue
             uniq_words.add(word)
             if tran_prob_dbm:
                 prob = self.GetTranProb(tran_prob_dbm, word, key)
                 score = (score * max(prob, 0.000001))**0.5
             scored_trans.append((word, score))
         scored_trans = sorted(scored_trans,
                               key=lambda x: x[1],
                               reverse=True)
         value = "\t".join([x[0] for x in scored_trans])
         output_dbm.Set(key, value).OrDie()
         num_records += 1
         if num_records % 10000 == 0:
             logger.info("Writing: records={}".format(num_records))
         it.Next()
     if tran_prob_dbm:
         tran_prob_dbm.Close().OrDie()
     output_dbm.Close().OrDie()
     logger.info("Writing done: records={}".format(num_records))
     mem_index.Close().OrDie()
     logger.info("Process done: elapsed_time={:.2f}s".format(time.time() -
                                                             start_time))
Ejemplo n.º 42
0
for item in p:
    # print(type(item), item.name, item.get('class'), item.get('id'))
    # print(item.text)
    if item.name == 'dl':
        maths = item.findAll('math')
        for math in maths:
            text += math.get('alttext')[15:-1] + '\n'
    else:
        math_elements = item.findAll(class_='mwe-math-element')
        if math_elements:
            matches = regex.findall(' \\n\\n[−\\/\\(\\)\\|\\=\\+\\-A-Z0-9a-z\s\\p{Greek}]+{\\\displaystyle .+}', item.text)
            temp2 = ''
            from_counter = 0
            for match in matches:
                temp = item.text[from_counter:item.text.find(match)]
                temp += ' ' + regex.search('{\\\displaystyle .+}', match)[0][15:-1]
                from_counter = (item.text.find(match) + len(match) + 2)
                temp2 += temp
            temp2 += item.text[from_counter:]
            text += temp2
        else:
            text += item.text

print('Downloading images...')
images = html_soup.findAll('img', {'src': regex.compile(r'\.(jpg|png)')})
image_url_list = []
for image in list(images):  # iterate over a copy so removal is safe
    if image.get('src').startswith('/static'):
        images.remove(image)
    elif 'https:' not in image.get('src'):
        image_url_list.append('https:' + image.get('src'))
Ejemplo n.º 43
0
def rok_zgonu(x):
    try:
        return int(
            re.search('(?<=\- ca |\-ca |\-ok\. |\-|po )(\d+)', x).group(0))
    except (TypeError, AttributeError):
        return None
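# Note that the look-behind above mixes alternatives of different lengths
# ("- ca ", "-", "po ", ...); the standard-library re module rejects such a
# pattern ("look-behind requires fixed-width pattern"), while the third-party
# regex module accepts it. A sketch of the same helper using regex (the sample
# string is made up; "rok zgonu" is Polish for "year of death"):
import regex

def rok_zgonu_regex(x):
    try:
        return int(
            regex.search(r'(?<=\- ca |\-ca |\-ok\. |\-|po )(\d+)', x).group(0))
    except (TypeError, AttributeError):
        return None

print(rok_zgonu_regex("zm. po 1920"))   # 1920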
Ejemplo n.º 44
0
# The function match() only looks for a match
# at the very beginning of a string
if regex.match(pattern1, reg_string1):
    print("You found a match!")
else:
    print("Bummer. No match.")

if regex.match(pattern1, reg_string2):
    print("You found a match!")
else:
    print("Bummer. No match.")

#=================================
# Second example, using search()
#=================================
if regex.search(pattern1, reg_string1):
    print("You found a match!")
else:
    print("Bummer. No match.")

if regex.search(pattern1, reg_string2):
    print("You found a match!")
else:
    print("Bummer. No match.")

#===========================================
# Comparing functions match() and search()
#===========================================
matches1 = regex.match(pattern1, reg_string1)
matches2 = regex.match(pattern1, reg_string2)
matches3 = regex.search(pattern1, reg_string1)
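# Since pattern1, reg_string1 and reg_string2 are not shown here, this is a
# self-contained illustration of the same match()/search() difference (the
# pattern and strings below are made up):
import regex

pattern = r"fox"
s1 = "fox jumps over the lazy dog"
s2 = "the quick brown fox"

print(bool(regex.match(pattern, s1)))   # True  - "fox" is at the start of s1
print(bool(regex.match(pattern, s2)))   # False - match() only checks the start
print(bool(regex.search(pattern, s1)))  # True
print(bool(regex.search(pattern, s2)))  # True  - search() scans the whole string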
Ejemplo n.º 45
0
#! /usr/bin/env python
# Fix Python source files to avoid using
#       def method(self, (arg1, ..., argn)):
# instead of the more rational
#       def method(self, arg1, ..., argn):
#
# Command line arguments are files or directories to be processed.
# Directories are searched recursively for files whose name looks
# like a python module.
# Symbolic links are always ignored (except as explicit directory
# arguments).  Of course, the original file is kept as a back-up
# (with a "~" attached to its name).
# It complains about binaries (files containing null bytes)
# and about files that are ostensibly not Python files: if the first
# line starts with '#!' and does not contain the string 'python'.
#
# Changes made are reported to stdout in a diff-like format.
#
# Undoubtedly you can do this using find and sed or perl, but this is
# a nice example of Python code that recurses down a directory tree
# and uses regular expressions.  Also note several subtleties like
# preserving the file's mode and avoiding to even write a temp file
# when no changes are needed for a file.
#
# NB: by changing only the function fixline() you can turn this
# into a program for a different change to Python programs...
import sys
import regex
import os
from stat import *
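# The header above notes that only fixline() needs changing to adapt the script;
# this is a hedged sketch (not the original code) of what a fixline() for the
# described transformation could look like, written with the modern re module:
import re

# "def method(self, (arg1, ..., argn)):"  ->  "def method(self, arg1, ..., argn):"
_defparen = re.compile(r"^(\s*def\s+\w+\s*\(\s*self\s*,\s*)\((.*)\)(\s*\)\s*:)")

def fixline(line):
    # Unmatched lines pass through untouched; the trailing newline is preserved.
    return _defparen.sub(r"\1\2\3", line, count=1)

print(fixline("def method(self, (a, b, c)):\n"), end="")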
Ejemplo n.º 46
0
    def _frac_like_number_parse(self,
                                ext_result: ExtractResult) -> ParseResult:
        result = ParseResult()
        result.start = ext_result.start
        result.length = ext_result.length
        result.text = ext_result.text
        result.type = ext_result.type

        result_text = ext_result.text.lower()
        if regex.search(self.config.fraction_marker_token, result_text):
            over_index = result_text.find(self.config.fraction_marker_token)
            small_part = result_text[0:over_index].strip()
            big_part = result_text[over_index +
                                   len(self.config.fraction_marker_token
                                       ):len(result_text)].strip()
            small_value = self._get_digital_value(
                small_part, 1) if self._is_digit(
                    small_part[0]) else self.__get_int_value(
                        self.__get_matches(small_part))
            big_value = self._get_digital_value(big_part, 1) if self._is_digit(
                big_part[0]) else self.__get_int_value(
                    self.__get_matches(big_part))

            result.value = small_value / big_value
        else:
            words = list(filter(lambda x: x, result_text.split(' ')))
            frac_words = self.config.normalize_token_set(words, result)

            # Split fraction with integer
            split_index = len(frac_words) - 1
            current_value = self.config.resolve_composite_number(
                frac_words[split_index])
            round_value = 1

            for split_index in range(len(frac_words) - 2, -1, -1):
                if (frac_words[split_index]
                        in self.config.written_fraction_separator_texts
                        or frac_words[split_index]
                        in self.config.written_integer_separator_texts):
                    continue
                previous_value = current_value
                current_value = self.config.resolve_composite_number(
                    frac_words[split_index])

                sm_hundreds = 100

                # previous: hundred
                # current: one
                if ((previous_value >= sm_hundreds
                     and previous_value > current_value) or
                    (previous_value < sm_hundreds
                     and self.__is_composable(current_value, previous_value))):
                    if (previous_value < sm_hundreds
                            and current_value >= round_value):
                        round_value = current_value
                    elif (previous_value < sm_hundreds
                          and current_value < round_value):
                        split_index += 1
                        break

                    # current is the first word
                    if split_index == 0:
                        # scan, skip the first word
                        split_index = 1
                        while split_index <= len(frac_words) - 2:
                            # e.g. one hundred thousand
                            # frac[i+1] % 100 and frac[i] % 100 = 0
                            if (self.config.resolve_composite_number(
                                    frac_words[split_index]) >= sm_hundreds
                                    and not frac_words[split_index + 1] in self
                                    .config.written_fraction_separator_texts
                                    and self.config.resolve_composite_number(
                                        frac_words[split_index + 1]) <
                                    sm_hundreds):
                                split_index += 1
                                break
                            split_index += 1
                        break
                    continue
                split_index += 1
                break

            frac_part = []
            for i in range(split_index, len(frac_words)):
                if frac_words[i].find('-') > -1:
                    split = frac_words[i].split('-')
                    frac_part.append(split[0])
                    frac_part.append('-')
                    frac_part.append(split[1])
                else:
                    frac_part.append(frac_words[i])

            frac_words = frac_words[:split_index]

            # denomi = denominator
            denomi_value = self.__get_int_value(frac_part)
            # Split mixed number with fraction
            numer_value = 0
            int_value = 0

            mixed_index = len(frac_words)
            for i in range(len(frac_words) - 1, -1, -1):
                if (i < len(frac_words) - 1 and frac_words[i]
                        in self.config.written_fraction_separator_texts):
                    numer_str = ' '.join(frac_words[i + 1:len(frac_words)])
                    numer_value = self.__get_int_value(
                        self.__get_matches(numer_str))
                    mixed_index = i + 1
                    break

            int_str = ' '.join(frac_words[0:mixed_index])
            int_value = self.__get_int_value(self.__get_matches(int_str))

            # Find mixed number
            if (mixed_index != len(frac_words) and numer_value < denomi_value):
                # int_value + numer_value / denomi_value
                result.value = int_value + numer_value / denomi_value
            else:
                # (int_value + numer_value) / denomi_value
                result.value = (int_value + numer_value) / denomi_value

            # Convert to float for fixed float point vs. exponential notation consistency /w C#/TS/JS
            result.value = float(result.value)
        return result
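# The mixed-number branch above reduces to this arithmetic; for example
# "two and three quarters" yields int_value=2, numer_value=3, denomi_value=4
# (the values below are made up to show the two cases):
int_value, numer_value, denomi_value = 2, 3, 4
if numer_value < denomi_value:
    value = int_value + numer_value / denomi_value    # mixed number: 2.75
else:
    value = (int_value + numer_value) / denomi_value  # plain fraction
print(float(value))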
Ejemplo n.º 47
0
 def AppendTranslations(self, wnjpn_trans, feedback_trans,
                        aux_trans, subaux_trans, tran_thes, synset_index, tran_index):
   start_time = time.time()
   logger.info("Appending translations: input_path={}, output_path={}".format(
     self.input_path, self.output_path))
   input_dbm = tkrzw.DBM()
   input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
   phrase_prob_dbm = None
   if self.phrase_prob_path:
     phrase_prob_dbm = tkrzw.DBM()
     phrase_prob_dbm.Open(self.phrase_prob_path, False, dbm="HashDBM").OrDie()
   rev_prob_dbm = None
   if self.rev_prob_path:
     rev_prob_dbm = tkrzw.DBM()
     rev_prob_dbm.Open(self.rev_prob_path, False, dbm="HashDBM").OrDie()
   tokenizer = tkrzw_tokenizer.Tokenizer()
    tran_prob_dbm = None
   if self.tran_prob_path:
     tran_prob_dbm = tkrzw.DBM()
     tran_prob_dbm.Open(self.tran_prob_path, False, dbm="HashDBM").OrDie()
   output_dbm = tkrzw.DBM()
   num_buckets = input_dbm.Count() * 2
   output_dbm.Open(
     self.output_path, True, dbm="HashDBM", truncate=True,
     align_pow=0, num_buckets=num_buckets).OrDie()
   num_words = 0
   num_orig_trans = 0
   num_match_trans = 0
   num_voted_trans = 0
   num_borrowed_trans = 0
   num_items = 0
   num_items_bare = 0
   num_items_rescued = 0
   it = input_dbm.MakeIterator()
   it.First()
   while True:
     record = it.GetStr()
     if not record: break
     key, serialized = record
     entry = json.loads(serialized)
     items = entry["item"]
     spell_ratios = {}
     for item in items:
       word = item["word"]
       phrase_prob = float(item.get("prob") or 0.0)
       spell_ratios[word] = phrase_prob + 0.00000001
     sum_prob = 0.0
     for word, prob in spell_ratios.items():
       sum_prob += prob
     for word, prob in list(spell_ratios.items()):
       spell_ratios[word] = prob / sum_prob
     all_tran_probs = tran_index.get(word) or {}
     for item in items:
       attrs = ["translation", "synonym", "antonym", "hypernym", "hyponym",
                "similar", "derivative"]
       for attr in attrs:
         rel_words = item.get(attr)
         if rel_words:
           rel_words = self.SortRelatedWords(
             rel_words, all_tran_probs, tokenizer, phrase_prob_dbm, tran_prob_dbm,
             synset_index, tran_index)
           item[attr] = rel_words
     for item in items:
       word = item["word"]
       pos = item["pos"]
       synset = item["synset"]
       links = item.get("link") or {}
       phrase_prob = float(item.get("prob") or 0.0)
       spell_ratio = spell_ratios[word]
       synonyms = item.get("synonym") or []
       hypernyms = item.get("hypernym") or []
       hyponyms = item.get("hyponym") or []
       similars = item.get("similar") or []
       derivatives = item.get("derivative") or []
       synonym_ids = links.get("synonym") or []
       hypernym_ids = links.get("hypernym") or []
       hyponym_ids = links.get("hyponym") or []
       similar_ids = links.get("similar") or []
       derivative_ids = links.get("derivative") or []
       item_tran_pairs = wnjpn_trans.get(synset) or []
       item_aux_trans = list(aux_trans.get(word) or [])
       ext_item_aux_trans = list(item_aux_trans)
       ext_item_aux_trans.extend(subaux_trans.get(word) or [])
       self.NormalizeTranslationList(tokenizer, pos, item_aux_trans)
       self.NormalizeTranslationList(tokenizer, pos, ext_item_aux_trans)
       scored_item_trans = collections.defaultdict(float)
       hand_trans = set()
       for tran, src in item_tran_pairs:
         if src == "mono":
           hit = False
           for item_aux_tran in ext_item_aux_trans:
             dist = tkrzw.Utility.EditDistanceLev(tran, item_aux_tran)
             dist_ratio = dist / max(len(tran), len(item_aux_tran))
              if dist_ratio < 0.3:
               hit = True
           if not hit:
             continue
         tran = tokenizer.NormalizeJaWordForPos(pos, tran)
         scored_item_trans[tran] = 1.0
         if src == "hand":
           hand_trans.add(tran)
       if feedback_trans:
         item_fb_trans = feedback_trans.get(word + ":" + synset) or []
         if item_fb_trans:
           for tran in item_fb_trans:
             tran = tokenizer.NormalizeJaWordForPos(pos, tran)
             if tran not in scored_item_trans:
               scored_item_trans[tran] = 0.9
       for tran, score in list(scored_item_trans.items()):
         if score != 1.0: continue
         cmp_words = tran_thes.get(tran)
         if cmp_words:
           for cmp_word in cmp_words:
             if cmp_word not in scored_item_trans:
               scored_item_trans[cmp_word] = 0.5
       num_items += 1
       bare = not scored_item_trans
       if bare:
         num_items_bare += 1
       num_orig_trans += len(scored_item_trans)
       syno_tran_counts = collections.defaultdict(int)
       hyper_tran_counts = collections.defaultdict(int)
       hypo_tran_counts = collections.defaultdict(int)
       similar_tran_counts = collections.defaultdict(int)
       derivative_tran_counts = collections.defaultdict(int)
       aux_trans_set = set(ext_item_aux_trans)
       checked_words = set()
       checked_ids = set([synset])
       adopted_rel_trans = set()
       voted_rel_words = set()
       voted_rel_records = set()
       for rel_words, rel_ids, tran_counts in (
           (synonyms, synonym_ids, syno_tran_counts),
           (hypernyms, hypernym_ids, hyper_tran_counts),
           (hyponyms, hyponym_ids, hypo_tran_counts),
           (similars, similar_ids, similar_tran_counts),
           (derivatives, derivative_ids, derivative_tran_counts)):
         for rel_word in rel_words:
           is_similar = self.AreSimilarWords(rel_word, word)
           rel_phrase_prob = 0.0
           if phrase_prob_dbm:
             rel_phrase_prob = self.GetPhraseProb(phrase_prob_dbm, tokenizer, "en", rel_word)
           mean_prob = (phrase_prob * rel_phrase_prob) ** 0.5
           rel_aux_trans = []
           if rel_word not in checked_words:
             checked_words.add(rel_word)
             tmp_aux_trans = aux_trans.get(rel_word)
             if tmp_aux_trans:
               rel_aux_trans.extend(tmp_aux_trans)
           for rel_id in synset_index[rel_word]:
             if rel_id not in rel_ids: continue
             if rel_id not in checked_ids:
               checked_ids.add(rel_id)
               tmp_aux_trans = wnjpn_trans.get(rel_id)
               if tmp_aux_trans:
                 tmp_aux_trans = [x[0] for x in tmp_aux_trans]
                 rel_aux_trans.extend(tmp_aux_trans)
           if rel_aux_trans:
             self.NormalizeTranslationList(tokenizer, pos, rel_aux_trans)
             if not is_similar and mean_prob < 0.0005:
               for item_aux_tran in ext_item_aux_trans:
                 if regex.fullmatch(r"[\p{Hiragana}]{,3}", item_aux_tran): continue
                 if item_aux_tran in rel_aux_trans:
                   if self.IsValidPosTran(tokenizer, pos, item_aux_tran):
                     adopted_rel_trans.add(item_aux_tran)
             if mean_prob < 0.005:
               voted_top = rel_word
               for voted_rel_word in voted_rel_words:
                 if self.AreSimilarWords(rel_word, voted_rel_word):
                   voted_top = voted_rel_word
                   break
               voted_rel_words.add(rel_word)
               for rel_aux_tran in set(rel_aux_trans):
                 voted_record = (voted_top, rel_aux_tran)
                 if voted_record in voted_rel_records:
                   continue
                 voted_rel_records.add(voted_record)
                 tran_counts[rel_aux_tran] += 1
       for rel_tran in adopted_rel_trans:
         scored_item_trans[rel_tran] = max(0.8, scored_item_trans[rel_tran] + 0.25)
         num_match_trans += 1
       if bare:
         for deri_tran, count in derivative_tran_counts.items():
           syno_tran_counts[deri_tran] = syno_tran_counts[deri_tran] + count
         derivative_tran_counts.clear()
       adopted_syno_trans = set()
       for syno_tran, count in syno_tran_counts.items():
         if regex.fullmatch(r"[\p{Hiragana}]{,3}", syno_tran): continue
         if syno_tran in hyper_tran_counts: count += 1
         if syno_tran in hypo_tran_counts: count += 1
         if syno_tran in similar_tran_counts: count += 1
         if syno_tran in derivative_tran_counts: count += 1
         if syno_tran in aux_trans_set: count += 1
         if count >= 3 and self.IsValidPosTran(tokenizer, pos, syno_tran):
           adopted_syno_trans.add(syno_tran)
       for syno_tran in adopted_syno_trans:
         scored_item_trans[syno_tran] = max(0.8, scored_item_trans[syno_tran] + 0.25)
         num_voted_trans += 1
       if item_aux_trans:
         aux_scores = {}
         for syno_tran, count in syno_tran_counts.items():
           if count < math.ceil(len(synonyms) * 2 / 3): continue
           if len(syno_tran) < 2: continue
           if not regex.search(r"\p{Han}[\p{Han}\p{Hiragana}]", syno_tran): continue
           for aux_tran in item_aux_trans:
             if aux_tran.find(syno_tran) >= 0 and self.IsValidPosTran(tokenizer, pos, aux_tran):
               weight = 0.25 if aux_tran == syno_tran else 0.2
               aux_scores[aux_tran] = max(aux_scores.get(aux_tran) or 0.0, weight)
         for hyper_tran, count in hyper_tran_counts.items():
           if count < math.ceil(len(hypernyms) * 2 / 3): continue
           if len(hyper_tran) < 2: continue
           if not regex.search(r"\p{Han}[\p{Han}\p{Hiragana}]", hyper_tran): continue
           for aux_tran in item_aux_trans:
             if aux_tran.find(hyper_tran) >= 0 and self.IsValidPosTran(tokenizer, pos, aux_tran):
               weight = 0.25 if aux_tran == hyper_tran else 0.2
               aux_scores[aux_tran] = max(aux_scores.get(aux_tran) or 0.0, weight)
         for aux_tran, score in aux_scores.items():
           scored_item_trans[aux_tran] = scored_item_trans[aux_tran] + score
           num_borrowed_trans += 1
       item_score = 0.0
       if scored_item_trans:
         scored_item_trans = scored_item_trans.items()
         if bare:
           num_items_rescued += 1
         if rev_prob_dbm or tran_prob_dbm:
           sorted_item_trans, item_score, tran_scores = (self.SortWordsByScore(
             word, pos, scored_item_trans, hand_trans, rev_prob_dbm, tokenizer, tran_prob_dbm))
          else:
            scored_item_trans = sorted(scored_item_trans, key=lambda x: x[1], reverse=True)
            sorted_item_trans = [x[0] for x in scored_item_trans]
            tran_scores = []
         final_item_trans = []
         uniq_item_trans = set()
         for tran in sorted_item_trans:
           tran = regex.sub(r"^を.*", "", tran)
           tran = regex.sub(r"・", "", tran)
           if not tran or tran in uniq_item_trans: continue
           uniq_item_trans.add(tran)
           final_item_trans.append(tran)
         item["translation"] = final_item_trans[:MAX_TRANSLATIONS_PER_WORD]
         if tran_scores:
           tran_score_map = {}
           for tran, tran_score in tran_scores[:MAX_TRANSLATIONS_PER_WORD]:
             tran = regex.sub(r"^を.*", "", tran)
             tran = regex.sub(r"・", "", tran)
             if tran and tran not in tran_score_map:
               tran_score_map[tran] = "{:.6f}".format(tran_score).replace("0.", ".")
           item["translation_score"] = tran_score_map
       item_score += spell_ratio * 0.5
       item["score"] = "{:.8f}".format(item_score).replace("0.", ".")
       if "link" in item:
         del item["link"]
     if rev_prob_dbm:
       entry["item"] = sorted(
         items, key=lambda item: float(item.get("score") or 0.0), reverse=True)
     serialized = json.dumps(entry, separators=(",", ":"), ensure_ascii=False)
     output_dbm.Set(key, serialized).OrDie()
     num_words += 1
     if num_words % 1000 == 0:
       logger.info("Saving words: words={}".format(num_words))
     it.Next()
   output_dbm.Close().OrDie()
   if tran_prob_dbm:
     tran_prob_dbm.Close().OrDie()
   if rev_prob_dbm:
     rev_prob_dbm.Close().OrDie()
   if phrase_prob_dbm:
     phrase_prob_dbm.Close().OrDie()
   input_dbm.Close().OrDie()
   logger.info(
     "Aappending translations done: words={}, elapsed_time={:.2f}s".format(
       num_words, time.time() - start_time))
   logger.info(("Stats: orig={}, match={}, voted={}, borrowed={}" +
                ", items={}, bare={}, rescued={}").format(
     num_orig_trans, num_match_trans, num_voted_trans, num_borrowed_trans,
     num_items, num_items_bare, num_items_rescued))
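# The aux-translation borrowing above only accepts a candidate suggested by at
# least two thirds of the synonyms (or hypernyms) that could have voted for it;
# the threshold for a few illustrative list sizes:
import math

for n in (3, 5, 7, 10):
    print(n, math.ceil(n * 2 / 3))   # 3 -> 2, 5 -> 4, 7 -> 5, 10 -> 7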
Ejemplo n.º 48
0
 def check_if_part_of_date(self, text_body):
     rex = r'^(\d{4}|\d{2})(-|-\d{2})?(-|-\d{2})?$'
     if regex.search(rex, text_body):
         return True
     return False
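# A quick check of the date-fragment pattern above (the sample strings below are
# made up): it accepts a 2- or 4-digit year optionally followed by "-MM" and
# "-DD" parts, where either part may also be a bare "-".
import regex

rex = r'^(\d{4}|\d{2})(-|-\d{2})?(-|-\d{2})?$'
for s in ["2021", "2021-06", "2021-06-15", "21-06-", "June 2021"]:
    print(s, bool(regex.search(rex, s)))
# 2021 True, 2021-06 True, 2021-06-15 True, 21-06- True, June 2021 False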
Ejemplo n.º 49
0
def main(img):
    osd = pytesseract.image_to_osd(img)
    angle = re.search("(?<=Rotate: )\d+", osd).group(0)
    script = re.search("(?<=Script: )\d+", osd).group(0)
    print("angle: ", angle)
    print("script: ", script)
Ejemplo n.º 50
0
    def _eval(self, context: RuleContext) -> Optional[LintResult]:
        """Do not use special characters in object names."""
        # Config type hints
        self.quoted_identifiers_policy: str
        self.unquoted_identifiers_policy: str
        self.allow_space_in_identifier: bool
        self.additional_allowed_characters: str
        self.ignore_words: str
        self.ignore_words_regex: str

        # Confirm it's a single identifier.
        assert context.segment.is_type("naked_identifier", "quoted_identifier")

        # Get the ignore_words_list configuration.
        try:
            ignore_words_list = self.ignore_words_list
        except AttributeError:
            # First-time only, read the settings from configuration. This is
            # very slow.
            ignore_words_list = self._init_ignore_words_list()

        # Assume unquoted (we'll update if quoted)
        policy = self.unquoted_identifiers_policy

        identifier = context.segment.raw

        # Skip if in ignore list
        if ignore_words_list and identifier.lower() in ignore_words_list:
            return None

        # Skip if matches ignore regex
        if self.ignore_words_regex and regex.search(
            self.ignore_words_regex, identifier
        ):
            return LintResult(memory=context.memory)

        # Do some extra processing for quoted identifiers.
        if context.segment.is_type("quoted_identifier"):

            # Update the default policy to quoted
            policy = self.quoted_identifiers_policy

            # Strip the quotes first
            identifier = context.segment.raw[1:-1]

            # Skip if in ignore list - repeat the check now that we've stripped the quotes
            if ignore_words_list and identifier.lower() in ignore_words_list:
                return None

            # Skip if matches ignore regex - repeat the check now that we've stripped the quotes
            if self.ignore_words_regex and regex.search(
                self.ignore_words_regex, identifier
            ):
                return LintResult(memory=context.memory)

            # BigQuery table references are quoted in back ticks so allow dots
            #
            # It also allows a star at the end of table_references for wildcards
            # (https://cloud.google.com/bigquery/docs/querying-wildcard-tables)
            #
            # Strip both out before testing the identifier
            if (
                context.dialect.name in ["bigquery"]
                and context.parent_stack
                and context.parent_stack[-1].is_type("table_reference")
            ):
                if identifier[-1] == "*":
                    identifier = identifier[:-1]
                identifier = identifier.replace(".", "")

            if context.dialect.name in ["sparksql"] and context.parent_stack:

                # SparkSQL file references for direct file query
                # are quoted in back ticks to allow for identifiers common
                # in file paths and regex patterns for path globbing
                # https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select-file.html
                #
                # Path Glob Filters (done inline for SQL direct file query)
                # https://spark.apache.org/docs/latest/sql-data-sources-generic-options.html#path-global-filter
                #
                if context.parent_stack[-1].is_type("file_reference"):
                    return None

                # SparkSQL properties keys used for setting table and runtime
                # configurations denote namespace using dots, so these are
                # removed before testing L057 to not trigger false positives
                # Runtime configurations:
                # https://spark.apache.org/docs/latest/configuration.html#application-properties
                # Example configurations for table:
                # https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#configuration
                #
                if context.parent_stack[-1].is_type("property_name_identifier"):
                    identifier = identifier.replace(".", "")

            # Strip spaces if allowed (note a separate config as only valid for quoted
            # identifiers)
            if self.allow_space_in_identifier:
                identifier = identifier.replace(" ", "")

        # We always allow underscores so strip them out
        identifier = identifier.replace("_", "")

        # redshift allows a # at the beginning of temporary table names
        if (
            context.dialect.name == "redshift"
            and identifier[0] == "#"
            and context.parent_stack
            and context.parent_stack[-1].is_type("table_reference")
        ):
            identifier = identifier[1:]

        # Set the identifier minus the allowed characters
        additional_allowed_characters = self._get_additional_allowed_characters(
            context.dialect.name
        )
        if additional_allowed_characters:
            identifier = identifier.translate(
                str.maketrans("", "", additional_allowed_characters)
            )

        # Finally test if the remaining identifier is only made up of alphanumerics
        if identifiers_policy_applicable(policy, context.parent_stack) and not (
            identifier.isalnum()
        ):
            return LintResult(anchor=context.segment)

        return None
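# The final check above boils down to: strip underscores and any configured
# extra characters, then require the remainder to be alphanumeric. A standalone
# illustration (the identifier and allowed characters below are made up):
identifier = "my_table$2023"
additional_allowed_characters = "$"
stripped = identifier.replace("_", "").translate(
    str.maketrans("", "", additional_allowed_characters))
print(stripped, stripped.isalnum())   # mytable2023 True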
Ejemplo n.º 51
0
  def PrepareSection(self, clusters, num_sections, has_next,
                     uniq_words, body_dbm, phrase_dbm, vetted_words, out_words, out_children):
    section_main_words = []
    section_extra_word_lists = []
    for cluster in clusters:
      main_words = []
      skipped_words = []
      dedup_words = []
      aliases = collections.defaultdict(set)
      num_words = len(cluster[0])
      local_uniq_words = set()
      for word in cluster[0]:
        data = body_dbm.GetStr(word)
        if not data:
          continue
        is_dup = word in uniq_words or word in local_uniq_words
        entries = json.loads(data)
        for entry in entries:
          if entry["word"] != word: continue
          if is_dup:
            is_verb = False
            for item in entry["item"]:
              if item["label"] == "wn" and item["pos"] == "verb":
                is_verb = True
            has_verb_inflection = entry.get("verb_present_participle")
            if not is_verb or not has_verb_inflection:
              continue
          parents = entry.get("parent")
          children = entry.get("child")
          for derivatives in (parents, children):
            if derivatives:
              for derivative in derivatives:
                local_uniq_words.add(derivative)
          trans = entry.get("translation")
          if not trans:
            continue
          has_good_tran = False
          for tran in trans[:6]:
            if regex.search(r"[\p{Han}\p{Hiragana}]", tran):
              has_good_tran = True
              break
          if not has_good_tran:
            skipped_words.append(word)
            continue
          count_synonyms = collections.defaultdict(int)
          num_items = 0
          for item in entry["item"]:
            if item["label"] == "wn":
              num_items += 1
            for part in item["text"].split("[-]"):
              part = part.strip()
              match = regex.search(r"\[synonym\]: (.*)", part)
              if match:
                for synonym in match.group(1).split(","):
                  synonym = synonym.strip()
                  if synonym:
                    count_synonyms[synonym] += 1
          synonyms = set()
          for synonym, count in count_synonyms.items():
            if count >= num_items:
              synonyms.add(synonym)
          if synonyms:
            dedup_words.append((word, synonyms))
          duplicated = False
          for dedup_word, dedup_synonyms in dedup_words:
            if word[0] == dedup_word[0]:
              dist = tkrzw.Utility.EditDistanceLev(word, dedup_word)
              if dist <= 1 and (word in dedup_synonyms or dedup_word in synonyms):
                aliases[dedup_word].add(word)
                duplicated = True
          if duplicated:
            skipped_words.append(word)
            continue
          main_words.append(word)
          break
      extra_words = []
      for extra_word in cluster[1]:
        if extra_word not in vetted_words:
          extra_words.append(extra_word)
      while len(main_words) < num_words and extra_words:
        main_words.append(extra_words[0])
        extra_words = extra_words[1:]
      for skipped_word in skipped_words:
        extra_words.insert(0, skipped_word)
      for word in main_words:
        surfaces = [word]
        surfaces.extend(aliases.get(word) or [])
        main_surface = ""
        other_surfaces = []
        if len(surfaces) == 1:
          main_surface = surfaces[0]
        else:
          prob_surfaces = []
          for surface in surfaces:
            prob = float(phrase_dbm.GetStr(surface) or "0")
            prob_surfaces.append((surface, prob))
          prob_surfaces = sorted(prob_surfaces, key=lambda x: x[1], reverse=True)
          main_surface = prob_surfaces[0][0]
          other_surfaces = [x[0] for x in prob_surfaces[1:]]
        section_main_words.append((main_surface, other_surfaces))
      section_extra_word_lists.append(extra_words)
    for main_word in section_main_words:
      out_words.append(main_word[0])
    out_path = os.path.join(self.output_path, "study-{:03d}.xhtml".format(num_sections))
    logger.info("Creating: {}".format(out_path))

    with open(out_path, "w") as out_file:
      self.OutputStudy(out_file, num_sections, has_next,
                       section_main_words, section_extra_word_lists,
                       body_dbm, uniq_words, out_children)
    out_path = os.path.join(self.output_path, "check-{:03d}.xhtml".format(num_sections))
    logger.info("Creating: {}".format(out_path))
    with open(out_path, "w") as out_file:
      self.OutputCheck(out_file, num_sections, has_next, out_words, out_children, body_dbm)
Ejemplo n.º 52
0
    def getCoverageProblems(self):
        """Verify that each target, rule and exclusion has the right number of tests
                 that applies to it. Also check that each target has the right
                 number of tests. In particular left-wildcard targets should have at least
                 three tests. Right-wildcard targets should have at least ten tests.

                 Returns an array of strings reporting any coverage problems if they exist,
                 or empty list if coverage is sufficient.
                 """
        self._determineTestApplication()
        problems = []

        # First, check each target has the right number of tests
        myTestTargets = []

        # Only take tests which are not excluded into account
        for test in self.tests:
            if not self.excludes(test.url):
                urlParts = urlparse(test.url)
                hostname = urlParts.hostname
                myTestTargets.append(hostname)

        for target in self.targets:
            actual_count = 0
            needed_count = 1

            if target.startswith("*."):
                needed_count = 3

            if target.endswith(".*"):
                needed_count = 10

            # non-wildcard targets always have an implicit test URL, if it is not excluded
            if not "*" in target and not self.excludes(
                ("http://{}/".format(target))):
                continue

            # According to the logic in rules.js available at
            # EFForg/https-everywhere/blob/07fe9bd51456cc963c2d99e327f3183e032374ee/chromium/rules.js#L404
            #
            pattern = target.replace('.', '\.')  # .replace('*', '.+')

            # `*.example.com` matches `bar.example.com` and `foo.bar.example.com` etc.
            if pattern[0] == '*':
                pattern = pattern.replace('*', '.+')

            # however, `example.*` match `example.com` but not `example.co.uk`
            if pattern[-1] == '*':
                pattern = pattern.replace('*', '[^\.]+')

            # `www.*.example.com` match `www.image.example.com` but not `www.ssl.image.example.com`
            pattern = pattern.replace('*', '[^\.]+')

            pattern = '^' + pattern + '$'

            for test in myTestTargets:
                if regex.search(pattern, test) is not None:
                    actual_count += 1

                    if not actual_count < needed_count:
                        break

            if actual_count < needed_count:
                problems.append(
                    "{}: Not enough tests ({} vs {}) for {}".format(
                        self.filename, actual_count, needed_count, target))

        # Next, make sure each rule or exclusion has sufficient tests.
        for rule in self.rules:
            needed_count = 1 + len(regex.findall("[+*?|]", rule.fromPattern))
            # Don't treat the question mark in non-capturing and lookahead groups as increasing the
            # number of required tests.
            needed_count = needed_count - \
                len(regex.findall("\(\?:", rule.fromPattern))
            needed_count = needed_count - \
                len(regex.findall("\(\?!", rule.fromPattern))
            needed_count = needed_count - \
                len(regex.findall("\(\?=", rule.fromPattern))
            # Don't treat escaped questions marks as increasing the number of required
            # tests.
            needed_count = needed_count - \
                len(regex.findall("\\?", rule.fromPattern))
            actual_count = len(rule.tests)
            if actual_count < needed_count:
                problems.append(
                    "{}: Not enough tests ({} vs {}) for {}".format(
                        self.filename, actual_count, needed_count, rule))
                pass
        for exclusion in self.exclusions:
            needed_count = 1 + \
                len(regex.findall("[+*?|]", exclusion.exclusionPattern))
            needed_count = needed_count - \
                len(regex.findall("\(\?:", exclusion.exclusionPattern))
            needed_count = needed_count - \
                len(regex.findall("\\?", rule.fromPattern))
            actual_count = len(exclusion.tests)
            if actual_count < needed_count:
                problems.append(
                    "{}: Not enough tests ({} vs {}) for {}".format(
                        self.filename, actual_count, needed_count, exclusion))
        return problems
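# An illustration of the target-to-regex translation implemented above (the
# helper and hostnames below are mine, distilled from the comments): a left
# wildcard matches any number of subdomains, while right and inner wildcards
# match exactly one label.
import regex

def target_to_pattern(target):
    pattern = target.replace('.', r'\.')
    if pattern[0] == '*':
        pattern = pattern.replace('*', '.+')
    if pattern[-1] == '*':
        pattern = pattern.replace('*', r'[^\.]+')
    pattern = pattern.replace('*', r'[^\.]+')
    return '^' + pattern + '$'

print(bool(regex.search(target_to_pattern("*.example.com"), "foo.bar.example.com")))      # True
print(bool(regex.search(target_to_pattern("example.*"), "example.co.uk")))                # False
print(bool(regex.search(target_to_pattern("www.*.example.com"), "www.img.example.com")))  # True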
Ejemplo n.º 53
0
def web_tech(url, no_cache=False, verbose=False):

    if verbose:
        logging.basicConfig(level=logging.INFO, format='%(message)s')

    if not no_cache:
        homedir = Path(os.path.expanduser('~'))
        requests_cache.install_cache(str(homedir / '.habu_requests_cache'),
                                     expire_after=3600)

    try:
        r = requests.get(url)
    except Exception as e:
        logging.error(e)
        return False

    with (DATADIR / 'apps-habu.json').open() as f:
        data = json.load(f)

    apps = data['apps']
    categories = data['categories']

    content = r.text
    soup = BeautifulSoup(content, "lxml")
    tech = {}

    for app in apps:

        version_group = False

        for header in apps[app].get('headers', []):
            if header in r.headers:

                header_regex = apps[app]['headers'][header].split('\;')[0]

                if '\;version:\\' in apps[app]['headers'][header]:
                    version_group = apps[app]['headers'][header].split(
                        '\;version:\\')[1]

                match = re.search(header_regex,
                                  r.headers[header],
                                  flags=re.IGNORECASE)
                if match or not header_regex:
                    logging.info(
                        "{app} detected by {header} HTTP header = {header_content}"
                        .format(app=app,
                                header=header,
                                header_content=r.headers[header]))
                    if app not in tech:
                        tech[app] = apps[app]

                    if version_group and version_group.isdigit():
                        try:
                            version = match.group(int(version_group))
                            if version:
                                tech[app]['version'] = version
                                logging.info(
                                    "The version detected is {version}".format(
                                        version=version))
                        except IndexError:
                            pass

        for key in ['script', 'html']:

            version_group = False

            for item in apps[app].get(key, []):
                item_regex = item.split('\;')[0]

                if '\;version:\\' in item:
                    version_group = item.split('\;version:\\')[1]

                match = re.search(item_regex,
                                  r.text,
                                  flags=re.IGNORECASE & re.MULTILINE)
                if match:
                    logging.info(
                        "{app} detected by HTML body with regex {regex}".
                        format(app=app, regex=item_regex))
                    if app not in tech:
                        tech[app] = apps[app]

                    if version_group and version_group.isdigit():

                        try:
                            version = match.group(int(version_group))
                            if version:
                                tech[app]['version'] = version
                                logging.info(
                                    "The version detected is {version}".format(
                                        version=version))
                        except IndexError:
                            pass

        for url_regex in apps[app].get('url', []):
            match = re.search(url_regex,
                              url,
                              flags=re.IGNORECASE | re.MULTILINE)
            if match:
                logging.info("{app} detected by URL with regex {regex}".format(
                    app=app, regex=url_regex))
                if app not in tech:
                    tech[app] = apps[app]

        for cookie_name in apps[app].get('cookies', []):

            for cookie in r.cookies:
                if cookie_name == cookie.name:
                    logging.info("{app} detected by cookie {cookie}".format(
                        app=app, cookie=cookie.name))

                    if app not in tech:
                        tech[app] = apps[app]

        for meta in apps[app].get('meta', []):

            version_group = False

            for tag in soup.find_all("meta", attrs={'name': meta}):
                meta_regex = apps[app]['meta'][meta]

                if '\\;version:\\' in meta_regex:
                    version_group = meta_regex.split('\\;version:\\')[1]

                meta_regex = meta_regex.split('\\;')[0]

                try:
                    match = re.search(meta_regex,
                                      tag['content'],
                                      flags=re.IGNORECASE)
                except KeyError:
                    continue

                if match:
                    logging.info(
                        "{app} detected by meta {meta} tag with regex {regex}".
                        format(app=app, meta=meta, regex=meta_regex))

                    if app not in tech:
                        tech[app] = apps[app]

                    if version_group and version_group.isdigit():

                        try:
                            version = match.group(int(version_group))
                            if version:
                                tech[app]['version'] = version
                                logging.info(
                                    "The version detected is {version}".format(
                                        version=version))

                        except IndexError:
                            pass

    for t in list(tech.keys()):
        for imply in tech[t].get('implies', []):
            imply = imply.split('\\;')[0]
            if imply not in tech:
                logging.info("{imply} detected because implied by {t}".format(
                    imply=imply, t=t))
                tech[imply] = apps[imply]

    for t in list(tech.keys()):
        for exclude in tech[t].get('excludes', []):
            if exclude in tech:
                logging.info(
                    "removing {exclude} because it is excluded by {t}".format(
                        exclude=exclude, t=t))
                del tech[exclude]

    response = {}

    for t in sorted(tech):
        response[t] = {'categories': []}
        if 'version' in tech[t]:
            response[t]['version'] = tech[t]['version']
        for category in tech[t]['cats']:
            response[t]['categories'].append(categories[str(category)]['name'])

    return response
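
A minimal usage sketch for web_tech(), assuming the module-level imports the function relies on (requests, requests_cache, BeautifulSoup, json, logging, os, re, pathlib.Path) and a DATADIR pointing at the apps-habu.json fingerprint file:

if __name__ == '__main__':
    detected = web_tech('https://example.com', no_cache=True, verbose=True)
    if detected:
        for name, info in sorted(detected.items()):
            print('{name} [{cats}] {version}'.format(
                name=name,
                cats=', '.join(info['categories']),
                version=info.get('version', '')))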
Ejemplo n.º 54
0
def delete(title):
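    # return the title when it matches r'B5.5' (note that the '.' matches any
    # character, so e.g. 'B5x5' also passes; r'B5\.5' would require a literal dot)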
    if re.search(r'B5.5', title):
        return title
    return None
Ejemplo n.º 55
0
    def parse_number_with_month(self, source: str,
                                reference: datetime) -> DateTimeParseResult:
        trimmed_source = source.strip()
        ambiguous = True
        result = DateTimeResolutionResult()

        ers = self.config.ordinal_extractor.extract(trimmed_source)

        if not ers:
            ers = self.config.integer_extractor.extract(trimmed_source)

        if not ers:
            return result

        num = int(self.config.number_parser.parse(ers[0]).value)
        day = 1
        month = 0

        match = regex.search(self.config.month_regex, trimmed_source)

        if match:
            month = self.config.month_of_year.get(match.group())
            day = num
        else:
            # handling relative month
            match = regex.search(self.config.relative_month_regex,
                                 trimmed_source)
            if match:
                month_str = match.group('order')
                swift = self.config.get_swift_month(month_str)
                date = reference.replace(month=reference.month + swift)
                month = date.month
                day = num
                ambiguous = False

        # handling cases like 'second Sunday'
        if not match:
            match = regex.search(self.config.week_day_regex, trimmed_source)
            if match:
                month = reference.month
                # resolve the date of wanted week day
                wanted_week_day = self.config.day_of_week.get(
                    match.group('weekday'))
                first_date = DateUtils.safe_create_from_min_value(
                    reference.year, reference.month, 1)
                first_weekday = first_date.isoweekday()
                delta_days = wanted_week_day - first_weekday if wanted_week_day > first_weekday else wanted_week_day - first_weekday + 7
                first_wanted_week_day = first_date + timedelta(days=delta_days)
                day = first_wanted_week_day.day + ((num - 1) * 7)
                ambiguous = False

        if not match:
            return result

        year = reference.year

        # for LUIS format value string
        date = DateUtils.safe_create_from_min_value(year, month, day)
        future_date = date
        past_date = date

        if ambiguous:
            result.timex = DateTimeFormatUtil.luis_date(-1, month, day)

            if future_date < reference:
                future_date = future_date.replace(year=future_date.year + 1)

            if past_date >= reference:
                past_date = past_date.replace(year=past_date.year + 1)
        else:
            result.timex = DateTimeFormatUtil.luis_date(year, month, day)

        result.future_value = future_date
        result.past_value = past_date
        result.success = True
        return result
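
The 'second Sunday' branch above computes the n-th occurrence of a weekday within the reference month. A standalone sketch of that arithmetic, as an illustration only (not part of the parser):

from datetime import date, timedelta

def nth_weekday(year, month, weekday_iso, n):
    # weekday_iso uses Monday=1 .. Sunday=7, matching isoweekday() above
    first = date(year, month, 1)
    first_iso = first.isoweekday()
    delta = weekday_iso - first_iso if weekday_iso > first_iso else weekday_iso - first_iso + 7
    # like the parser, day 1 itself is skipped when the month starts on the wanted weekday
    return first + timedelta(days=delta + (n - 1) * 7)

print(nth_weekday(2024, 10, 7, 2))  # -> 2024-10-13, the second Sunday of October 2024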
Ejemplo n.º 56
0
def render(source, record):
    katexstorage = {}

    # convert html character references (ie. &#62;) to unicode
    source = html.unescape(source)
    # convert <cp>...</cp>
    source = re.sub(CP_REGEX, r'<div class="grey-block">\1</div>', source)
    # convert <cpb>...</cpb>
    source = re.sub(CPB_REGEX, r'<div class="blue-block">\1</div>', source)
    # convert <Q>...</Q>
    source = re.sub(Q_REGEX, r'<blockquote>\1</blockquote>', source)
    # convert <k>...</k>
    source = re.sub(K_REGEX, r'<div class="center-paragraph">\1</div>', source)
    # convert <ind>...</ind>
    source = re.sub(IND_REGEX, r'<div class="indent-paragraph">\1</div>',
                    source)
    # convert latex to katex
    source = re.sub(LATEX_REGEX,
                    lambda match: katexprerender(match, katexstorage), source)
    # convert ^superscript
    source = re.sub(SUPERSCRIPT_REGEX, r'<span class="superscript">\1</span>',
                    source)
    # convert ¬subscript
    source = re.sub(SUBSCRIPT_REGEX, r'<span class="subscript">\1</span>',
                    source)
    # convert <m>...</m> and <m name>...</m>
    source = re.sub(MLINK_REGEX, lambda match: mrender(match, record), source)
    # convert <g glossary>...</g>
    source = re.sub(GLINK_REGEX, lambda match: glrender(match, record), source)
    # convert <ac academy>...</ac>
    source = re.sub(ACLINK_REGEX, lambda match: societyrender(match, record),
                    source)
    # convert <E num>
    source = re.sub(ELINK_REGEX, lambda match: extrarender(match, record),
                    source)
    # convert <r>...</r>
    source = source.replace('<r>', '<span class="red-text">')
    source = source.replace('</r>', '</span>')
    # convert <bl>...</bl>
    source = source.replace('<bl>', '<span class="blue-text">')
    source = source.replace('</bl>', '</span>')
    # convert <gr>...</gr>
    source = source.replace('<gr>', '<span class="green-text">')
    source = source.replace('</gr>', '</span>')
    # convert <bro>...</bro>
    source = source.replace('<bro>', '<span class="brown-text">')
    source = source.replace('</bro>', '</span>')
    # convert <f+>...</f+>
    source = re.sub(FPLUS_REGEX, r'<span class="bigger">\1</span>', source)
    # convert <fp>...</fp>
    source = re.sub(FP_REGEX, r'<span class="bigger">\1</span>', source)
    # convert <f++>...</f++>
    source = re.sub(
        FPLUSPLUS_REGEX,
        r'<span class="bigger"><span class="bigger">\1</span></span>', source)
    # convert <f->...</f->
    source = re.sub(FMINUS_REGEX, r'<span class="smaller">\1</span>', source)
    # convert <fm>...</fm>
    source = re.sub(FM_REGEX, r'<span class="smaller">\1</span>', source)
    # convert <c>...</c>
    source = source.replace('<c>', '<code>')
    source = source.replace('</c>', '</code>')
    # convert <ovl>...</ovl>
    source = source.replace('<ovl>', '<span class="overline">')
    source = source.replace('</ovl>', '</span>')
    # convert <d ...>
    source = re.sub(DIAGRAM_REGEX, lambda match: drender(match, record),
                    source)
    # convert [refnum]
    source = re.sub(REF_REGEX, lambda match: referencerender(match, record),
                    source)
    #source = re.sub(regex, r'<span>[<a href="#reference-\1" class="reference reference-\1">\1</a>]</span>', source)
    # convert <T num>
    source = re.sub(TRANS_REGEX, lambda match: trender(match, record), source)

    # other from the htmlformat function in the stack

    # new (improved?) break-adder
    TAGS_MATCHER = r'</?((?:n)|(?:table)|(?:tr)|(?:td(\s+colspan="?\d"?)?)|(?:figure)|(?:p)|(?:br)|(?:li)|(?:ol)|(?:ul)|(?:div(\s+id))|(?:div(\s+class="indent-paragraph"?)?)|(?:div(\s+class="center-paragraph"?)?)|(?:script)|(?:input)|(?:button)|(?:br ?/?)|(?:p)|(?:blockquote)|(?:code)|(?:h\d)|(?:hr ?/?)|(?:area)|(?:map))>'
    regex = re.compile(
        r'(?<!%s)\s*?\n(?!\s*%s)' % (TAGS_MATCHER, TAGS_MATCHER),
        re.MULTILINE | re.DOTALL)
    source = re.sub(regex, '\n<br>\n', source)

    # never more than two <br>s together
    match = re.search(BR_REGEX, source)
    while match:
        source = re.sub(BR_REGEX, '<br>\n<br>', source)
        match = re.search(BR_REGEX, source)

    # remove all the <n>s
    source = source.replace('<n>', '')

    # smart quotes
    source = source.replace('’', "'")
    source = source.replace('‘', "'")
    source = source.replace('“', '"')
    source = source.replace('”', '"')

    source = source.replace('<clear>', '<br clear="right">')
    source = source.replace('<clearl>', '<br clear="left">')
    source = source.replace('<proofend>',
                            '<d xproofend right><br clear=right>')

    source = tags_to_unicode(source)

    source = fix_italics(source, record)

    # put the katex formulas back in
    latex_array = list(katexstorage.values())
    html_array = katexrender(latex_array)
    for idx, key in enumerate(katexstorage.keys()):
        html_formula = html_array[idx]
        source = source.replace(key, html_formula)

    return '<span class="markup">%s</span>' % source
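
The katexstorage round trip above stashes each LaTeX formula behind an opaque placeholder so the remaining markup passes cannot mangle it, then swaps the rendered HTML back in at the end. A minimal sketch of that stash-and-restore pattern; the placeholder format, the $...$ regex and the final span are assumptions, not the module's LATEX_REGEX or katexrender:

import re

def stash_formula(match, storage):
    key = 'KATEX%dKATEX' % len(storage)  # hypothetical placeholder format
    storage[key] = match.group(1)
    return key

storage = {}
text = re.sub(r'\$(.+?)\$', lambda m: stash_formula(m, storage),
              'Euler: $e^{i\\pi} + 1 = 0$')
# ... the rest of the pipeline runs on `text`, which now contains only the placeholder ...
for key, rendered in storage.items():
    text = text.replace(key, '<span class="katex">%s</span>' % rendered)
print(text)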
Ejemplo n.º 57
0
    def number_with_month(self, source: str,
                          reference: datetime) -> List[Token]:
        ret: List[Token] = list()
        extract_results = self.config.ordinal_extractor.extract(source)
        extract_results.extend(self.config.integer_extractor.extract(source))

        for result in extract_results:
            num = int(self.config.number_parser.parse(result).value)

            if num < 1 or num > 31:
                continue

            if result.start >= 0:
                front_string = source[0:result.start or 0]
                match = regex.search(self.config.month_end, front_string)

                if match is not None:
                    ret.append(
                        Token(match.start(),
                              match.end() + result.length))
                    continue

                # handling cases like 'for the 25th'
                matches = regex.finditer(self.config.for_the_regex, source)
                is_found = False

                for match_case in matches:
                    if match_case is not None:
                        ordinal_num = RegExpUtility.get_group(
                            match_case, 'DayOfMonth')

                        if ordinal_num == result.text:
                            length = len(
                                RegExpUtility.get_group(match_case, 'end'))
                            ret.append(
                                Token(match_case.start(),
                                      match_case.end() - length))
                            is_found = True

                if is_found:
                    continue

                # handling cases like 'Thursday the 21st', where both 'Thursday' and '21st' refer to the same date
                matches = regex.finditer(
                    self.config.week_day_and_day_of_month_regex, source)

                for match_case in matches:
                    if match_case is not None:
                        ordinal_num = RegExpUtility.get_group(
                            match_case, 'DayOfMonth')

                        if ordinal_num == result.text:
                            month = reference.month
                            year = reference.year

                            # get the weekday of the ordinal number, treating it as a day of the reference month
                            date = DateUtils.safe_create_from_min_value(
                                year, month, num)
                            num_week_day_str: str = calendar.day_name[
                                date.weekday()].lower()

                            # get the weekday from the text directly and compare it with the weekday
                            # computed above to see whether they refer to the same weekday
                            extracted_week_day_str = RegExpUtility.get_group(
                                match_case, 'weekday').lower()
                            if (date != DateUtils.min_value and
                                    self.config.day_of_week[num_week_day_str]
                                    == self.config.
                                    day_of_week[extracted_week_day_str]):
                                ret.append(
                                    Token(match_case.start(),
                                          match_case.end()))
                                is_found = True

                if is_found:
                    continue

                # handling cases like '20th of next month'
                suffix_str: str = source[result.start + result.length:].lower()
                match = regex.match(self.config.relative_month_regex,
                                    suffix_str.strip())
                space_len = len(suffix_str) - len(suffix_str.strip())

                if match is not None and match.start() == 0:
                    ret.append(
                        Token(
                            result.start, result.start + result.length +
                            space_len + len(match.group())))

                # handling cases like 'second Sunday'
                match = regex.match(self.config.week_day_regex,
                                    suffix_str.strip())
                if (match is not None and match.start() == 0 and num >= 1
                        and num <= 5
                        and result.type == NumberConstants.SYS_NUM_ORDINAL):
                    week_day_str = RegExpUtility.get_group(match, 'weekday')

                    if week_day_str in self.config.day_of_week:
                        ret.append(
                            Token(
                                result.start, result.start + result.length +
                                space_len + len(match.group())))

            if result.start + result.length < len(source):
                after_string = source[result.start + result.length:]
                match = regex.match(self.config.of_month, after_string)

                if match is not None:
                    ret.append(
                        Token(
                            result.start,
                            result.start + result.length + len(match.group())))

        return ret
Ejemplo n.º 58
0
for aktuelle_zahl in range(0, 10):

    verzeichnis = './TESTSET/' + str(aktuelle_zahl) + '/'

    print("Verzeichnis: ", verzeichnis)

    regex_png = regex.compile('f*.png')

    with os.scandir(verzeichnis) as entries:

        #print("Anzahl der Einträge ", len(entries))

        for entry in entries:


            if regex.search(regex_png, entry.name):
                pfad = verzeichnis + entry.name
                image = mpimg.imread(pfad)
                image *= 255


                # for the test set
                x_test[test_index_dataset] = image
                y_test[test_index_dataset] = aktuelle_zahl
                test_index_dataset = test_index_dataset + 1

                # for the training set - starting at index 10,000
                x_train[train_index_dataset] = image
                y_train[train_index_dataset] = aktuelle_zahl
                train_index_dataset = train_index_dataset + 1
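
The loop above assumes NumPy arrays and running indices created earlier in the script; a minimal sketch of that setup (shapes and dataset sizes are guesses):

import numpy as np
import matplotlib.image as mpimg  # imread() is used above as mpimg.imread

x_test = np.zeros((10000, 28, 28), dtype=np.float32)   # assumed 28x28 greyscale digits
y_test = np.zeros(10000, dtype=np.uint8)
x_train = np.zeros((60000, 28, 28), dtype=np.float32)
y_train = np.zeros(60000, dtype=np.uint8)
test_index_dataset = 0
train_index_dataset = 0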
Ejemplo n.º 59
0
def add_cpgs(objs):
    sp_cc = 0
    cc = 0
    ## Keywords derived from the training set that clearly cannot be production companies; terms such as 和文化 and 和传媒 are blocked as well, because the 和 inside them would be treated as a connector and split on
    forbid_kws = [
        '》',
        '制作',
        '投资',
        '在',
        '担任',
        '联手',
        '怎么',
    ]
    forbid_kws += ['摄制', '电视剧', '和传媒', '旗下', '庆祝', '和平', '自家', '主演', '和文化']
    relist, _ = get_re()
    relist = relist['出品公司']
    strip_kw = ['一部', '一部由', '由', '联合', '共同', '独家', '合作', '著', '创作']
    for obj in objs:
        havecp = False
        for spo in obj['spo_list']:
            if spo['predicate'] == '出品公司':
                havecp = True
        if havecp or '出品' in obj['text']:
            # print_spo(obj['spo_list'],'出品公司')
            # print( obj['text'] )
            texts = split_text(obj['text'], r"[?。,]")
            for text in texts:
                text = text[text.find('》') + 1:]
                for restr in relist:
                    reresult = regex.search(restr[0], text)
                    if reresult is not None:
                        reresult = str(reresult.group())
                        ## drop everything after 等 ("etc.")
                        if '等' in reresult:
                            reresult = reresult[:reresult.find('等')]
                        cflag = True
                        for fkw in forbid_kws:
                            if fkw in reresult:
                                cflag = False
                                break
                        if not cflag:
                            break
                        for rss in [
                                r'\d{4}年\d{1,2}月\d{1,2}日', r'\d{4}年\d{1,2}月',
                                r'\d{4}年'
                        ]:
                            yearresult = re.search(rss, reresult)
                            if yearresult is not None:
                                yearresult = yearresult.group()
                                if '于' + yearresult in reresult:
                                    reresult = reresult[:reresult.
                                                        find('于' + yearresult)]
                                reresult = reresult.replace(yearresult, '')
                                break
                        ## strip leading/trailing filler keywords
                        for skw in strip_kw:
                            if skw in reresult:
                                reresult = reresult.strip(skw)
                        reresult = reresult.rstrip('于')
                        if reresult != '':
                            reresult = set(split_text(reresult, r"[、与和及]"))
                            temp_set = []
                            for rst in reresult:
                                ## keep only entries that do not start with a digit
                                if '' != rst and not rst[0].isdigit():
                                    if '联合' in rst:
                                        temp_set.extend(rst.split('联合'))
                                    else:
                                        temp_set.append(rst)
                            reresult = set(temp_set)
                            # print('!!!!!',reresult)
                            ents = set()
                            for spo in obj['spo_list']:
                                if spo['predicate'] == '出品公司':
                                    ents.add(spo['object'])
                            dif = reresult - ents
                            if len(dif) > 0:
                                cc += len(dif)
                                # print(dif ,reresult,obj['text'])
                                ## handle cases where some of the extracted companies already appear in the spo_list
                                same = reresult & ents
                                if len(same) > 0:
                                    sp_set = set()
                                    for sm in same:
                                        for spo in obj['spo_list']:
                                            if sm == spo['object']:
                                                sp_set.add((spo['subject'],
                                                            spo['predicate']))
                                    for df in dif:
                                        for sp in sp_set:
                                            add_spo(obj, sp[0], df, sp[1])
                                            sp_cc += 1
                                    # print('!!! sp',sp_set)
                                else:
                                    ## if there is exactly one pair of 《》 book-title marks, take the title inside them as the subject
                                    shumings = list(
                                        set(
                                            regex.findall(
                                                "(?<=《).*?(?=》)",
                                                obj['text'])))
                                    if len(shumings) == 1:
                                        for df in dif:
                                            add_spo(obj, shumings[0], df,
                                                    '出品公司')
                                            # print(shumings[0],df)
                                        print(shumings, dif, obj['text'])
                                        sp_cc += len(dif)
    print('sss', cc, sp_cc)
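
A quick illustration of the 《》 book-title extraction used above; the lookbehind/lookahead keep only the text between the marks:

import regex

shumings = list(set(regex.findall("(?<=《).*?(?=》)", "电影《流浪地球》由中影出品")))
# -> ['流浪地球']; when exactly one title is found, the code above uses it as the
#    subject for the newly extracted production companies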
        text=text.decode("ascii", "ignore")
        return text
    except Exception as e:
        print(e)
        pass
    
outputDictionary={}
claim=convert_pdf_to_txt_v2("/home/ubuntu/environment/Claim.pdf")
claim=repr(claim)
#Regular Expressions
claimUpdated=re.sub(r"\\n\\n",r"\\n",claim)

insuredName=re.search(r"(?<=Name and address of Insured\\n)[A-Za-z ]+",claimUpdated)[0]
outputDictionary["Insured Name:"]=insuredName

insuredAddressL1=regex.search(r"Name and address of Insured\\n[A-Za-z ]+\\n\K\S.*?(?=\\n)",claimUpdated)[0]
insuredAddressL2=regex.search(r"Name and address of Insured\\n[A-Za-z ]+\\n[1-9A-Za-z ]+\\n\K\S.*?(?=\\n)",claimUpdated)[0]
insuredAddressL3=regex.search(r"Name and address of Insured\\n[A-Za-z ]+\\n[1-9A-Za-z ]+\\n[A-Za-z ]+\\n\K\S.*?(?=\\n)",claimUpdated)[0]
insuredAddress=insuredAddressL1+" "+insuredAddressL2+" "+insuredAddressL3
outputDictionary["Insured Address:"]=insuredAddress

dob=regex.search(r"DOB:\s*\K\S.*?(?=\\n)",claimUpdated)[0]
outputDictionary["Date of birth:"]=dob

employersName=regex.search(r"Name:\s*\K\S.*?(?=\\n)",claimUpdated)[0]
outputDictionary["Employer's Name:"]=employersName

employersLocation=regex.search(r"Location:\s*\K\S.*?(?=\\n)",claimUpdated)[0]
outputDictionary["Employer's Location:"]=employersLocation

claim=re.search(r"(?<=Medical History).*$",claim)[0]
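
The extraction patterns above rely on the regex module's \K, which discards everything matched before it from the reported result. A small self-contained illustration; the sample string mimics the repr()-escaped claim text, where newlines appear as a literal backslash-n:

import regex

sample = r"DOB: 12 May 1980\nName: John Smith"
dob = regex.search(r"DOB:\s*\K\S.*?(?=\\n)", sample)[0]
print(dob)  # -> '12 May 1980'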