import multiprocessing as mp

import bs4
import regex
import urllib2


def html_adhoc_fetcher(url):
    html = None
    TIME_OUT = 5
    for attempt in range(5):
        opener = urllib2.build_opener()
        try:
            html = opener.open(str(url), timeout=TIME_OUT).read()
            break
        except Exception as e:
            print('[WARN] Cannot access url, error and try number are...', e, attempt, url, mp.current_process())
            continue
    if html is None:
        return None
    # Protect newlines so the comment/style/script patterns can span lines,
    # then restore them as spaces afterwards.
    line = html.replace('\n', '^A^B^C')
    line = regex.sub('<!--.*?-->', '', line)
    line = regex.sub('<style.*?/style>', '', line)
    html = regex.sub('<script.*?/script>', '', line).replace('^A^B^C', ' ')

    soup = bs4.BeautifulSoup(html, "html.parser")
    title = unicode(soup.title.string) if soup.title is not None else 'Untitled'
    contents_div = soup.find('div', {'class': 'ui-section-body'})
    contents0_text = contents_div.text.encode('utf-8') if contents_div is not None else ""
    links = set(a['href'] for a in soup.find_all('a', href=True))
    return title, contents0_text, links
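A minimal usage sketch; the URL below is only illustrative, and callers have to handle the None that comes back when all five attempts fail:

result = html_adhoc_fetcher("http://example.com/")
if result is not None:
    title, body_text, links = result
    print(title, len(links))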
Example No. 2
def remove_article(self, text):
    # \m and \M (word-start / word-end) are regex-module anchors, so use regex.sub rather than re.sub.
    for art in self.articles:
        text = regex.sub(r'\s*\m%s\M\s*' % art, ' ', text)

    text = regex.sub(r'\mdel\M', 'de', text)
    text = regex.sub(r'^\s*es\M\s*', '', text)
    return text.strip()
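A rough usage sketch, assuming regex is imported; the article list passed in here is only an assumed illustration of what self.articles might hold:

import types

holder = types.SimpleNamespace(articles=['el', 'la', 'los', 'las', 'un', 'una'])  # assumed article list, not from the source
print(remove_article(holder, 'la casa del árbol'))   # -> 'casa de árbol'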
Example No. 3
def preprocess(msg_body, delimiter, content_type='text/plain'):
    """Prepares msg_body for being stripped.

    Replaces link brackets so that they couldn't be taken for quotation marker.
    Splits line in two if splitter pattern preceded by some text on the same
    line (done only for 'On <date> <person> wrote:' pattern).
    """
    # normalize links i.e. replace '<', '>' wrapping the link with some symbols
    # so that '>' closing the link couldn't be mistakenly taken for quotation
    # marker.
    def link_wrapper(link):
        newline_index = msg_body[:link.start()].rfind("\n")
        if msg_body[newline_index + 1] == ">":
            return link.group()
        else:
            return "@@%s@@" % link.group(1)

    msg_body = re.sub(RE_LINK, link_wrapper, msg_body)

    def splitter_wrapper(splitter):
        """Wraps splitter with new line"""
        if splitter.start() and msg_body[splitter.start() - 1] != '\n':
            return '%s%s' % (delimiter, splitter.group())
        else:
            return splitter.group()

    if content_type == 'text/plain':
        msg_body = re.sub(RE_ON_DATE_SMB_WROTE, splitter_wrapper, msg_body)

    return msg_body
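RE_LINK and RE_ON_DATE_SMB_WROTE are module-level constants defined elsewhere in the library; the stand-ins below are hypothetical simplifications, shown only to illustrate the call shape:

import re

# Hypothetical stand-ins; the real library patterns are more elaborate.
RE_LINK = re.compile(r"<(https?://[^>]+)>")
RE_ON_DATE_SMB_WROTE = re.compile(r"On \w+ \d+, \d{4},? .* wrote:")

body = "details in <http://example.com/x>\nthanks On Jan 1, 2020, Bob wrote: earlier text"
print(preprocess(body, "\n"))
# '<http://example.com/x>' becomes '@@http://example.com/x@@', and the
# 'On ... wrote:' marker is pushed onto its own line by the delimiter.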
Example No. 4
def clean_tweet_text(tweet_text):
    tweet_text = tweet_text.lower()
    tweet_text = regex.sub(ur"\p{P}+", "", tweet_text)  # \p{P} (punctuation) needs the regex module, not re
    tweet_text = re.sub(r"[^a-zA-Z\s]", "", tweet_text)
    tweet_text = filter(lambda x: x in string.printable, tweet_text)
    tweet_text = tweet_text.encode('ascii', 'ignore')  # assign the result; encode() does not modify in place
    return tweet_text
Example No. 5
def expand_parens(string, parens="()", include_spaces=False, substitute_string=''):
    output = []
    open_paren = re.escape(parens[0])
    close_paren = re.escape(parens[1])
    substitute_string = re.escape(substitute_string)
    in_string = re.sub(open_paren + substitute_string, parens[0], string)
    in_string = re.sub(substitute_string + close_paren, parens[1], in_string)

    if include_spaces:
        regex1 = regex2 = re.compile(r'(^.*)' + open_paren + r'(.+)' + close_paren + r'(.*$)')
    else:
        regex1 = re.compile(r'(^.*\S)' + open_paren + r'(\S+)' + close_paren + r'(.*$)')
        regex2 = re.compile(r'(^.*)' + open_paren + r'(\S+)' + close_paren + r'(\S.*$)')

    re_match1 = regex1.search(in_string)
    re_match2 = regex2.search(in_string)
    if re_match1:
        within = re_match1.group(1) + re_match1.group(2) + re_match1.group(3)
        without = re_match1.group(1) + re_match1.group(3)
    elif re_match2:
        within = re_match2.group(1) + re_match2.group(2) + re_match2.group(3)
        without = re_match2.group(1) + re_match2.group(3)
    else:
        return [string]

    output = [clean_str(without), clean_str(within)]

    return output
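clean_str is defined elsewhere in that project; assuming it does little more than collapse whitespace, the common "optional plural" case expands like this (the stand-in below is hypothetical):

import re

def clean_str(s):   # hypothetical stand-in for the project's helper
    return re.sub(r'\s+', ' ', s).strip()

print(expand_parens("word(s)"))   # -> ['word', 'words']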
Example No. 6
def scrape_wiki():
    url = u"https://he.wikipedia.org/wiki/%D7%9E%D7%A0%D7%99%D7%99%D7%9F_%D7%94%D7%9E%D7%A6%D7%95%D7%95%D7%AA_%D7%A2%D7%9C_%D7%A4%D7%99_%D7%A1%D7%A4%D7%A8_%D7%94%D7%97%D7%99%D7%A0%D7%95%D7%9A"

    page = requests.get(url)
    soup_body = BeautifulSoup(page.text, "lxml")
    tables = soup_body.select(".mw-parser-output > table")

    pairs = []
    links = []

    for table in tables:
        table_tr = table.select("tr")
        for col in table_tr:
            pairs.append((col.contents[1].text.strip(), re.sub(u'</?td>', u'', col.contents[-1].text).strip()))

    for pair in pairs:
        if re.search(u'ספר|מספר', pair[0]):
            continue
        neg_pos = u"Negative Mitzvot" if re.search(u"לאו", pair[1]) else u'Positive Mitzvot'
        rambam = getGematria(re.sub(u'עשה|לאו', u'', pair[1]).strip())
        chinukh = getGematria(pair[0])
        print chinukh, rambam
        chinukh_simanlen = len(Ref(u'Sefer HaChinukh.{}'.format(chinukh)).all_segment_refs())
        print neg_pos
        link = ({"refs": [
            u'Sefer HaChinukh.{}.{}-{}'.format(chinukh, 1, chinukh_simanlen),
            u'Mishneh Torah, {}.{}'.format(neg_pos, rambam)
        ],
            "type": "Sifrei Mitzvot",
            "auto": True,
            "generated_by": "chinukh_rambam_sfm_linker"  # _sfm_linker what is this parametor intended to be?
        })
        print link['refs']
        links.append(link)

    return links
Example No. 7
def clean_text(text):
    clear_text_regexp = re.compile(r'(?u)\w+|[,.!?]')
    text_ = " ".join(clear_text_regexp.findall(text)).replace(" .", ".").replace(" ,", ",")
    text_ = re.sub(r"[,]+", ",", text_)
    text_ = re.sub(r"[.]+", ".", text_)
    text_ = re.sub(r"\s+", " ", text_)
    return text_
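A quick check of what the normalization does to repeated punctuation and stray spaces:

print(clean_text("So ,,, many  ,  commas .. here"))   # -> 'So, many, commas. here'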
Example No. 8
def all_caps_text(s, site):
    s = regex.sub("<[^>]*>", "", s)   # remove HTML tags
    s = regex.sub("&\w+;", "", s)     # remove HTML entities
    if len(s) <= 150 and regex.compile(ur"SQL|\b(ERROR|PHP|QUERY|ANDROID|CASE|SELECT|HAVING|COUNT|GROUP|ORDER BY|INNER|OUTER)\b").search(s):
        return False, ""   # common words in non-spam all-caps titles
    if len(s) >= 25 and regex.compile(ur"^(?=.*\p{upper})\P{lower}*$", regex.UNICODE).search(s):
        return True, "All in caps"
    return False, ""
Example No. 9
def main():

	transDict, Greek_word_num, Greek_search_dict, Greek_text = preprocessing()

	# Save lemma to translations found
	found_translist = {}

	try:
		while (True):

			scoreKeeper = scoreboard(MAX_SCOREBOARD_SIZE, MIN_SCORE)

			input_phrase = input("Enter Search Phrase>  ")

			if re.sub(" ", "", re.sub("q", "", input_phrase)) == "" or re.sub(" ", "", re.sub("quit", "", input_phrase)) == "":
				exit(0)

			if (valid_search(input_phrase)):
				
				search = search_phrase(input_phrase, "Latin")

				# Find all the translations of the given words
				for i in range(search.search_len):
					search.has_translation[i] = trn.get_translation(search.text[i], transDict, found_translist)
		
				xls.try_all_search_combos(search, scoreKeeper, Greek_word_num, Greek_search_dict, Greek_text)

				print(scoreKeeper)

			else:
				print('Please enter a valid string\n')

	except KeyboardInterrupt:
		print('\nProgram Terminated\n')
		sys.exit(0)
Example No. 10
File: lex.py Project: amitdo/nidaba
def tei_spellcheck(facsimile, dictionary, deletion_dictionary,
                   filter_punctuation=False):
    """
    Performs a spell check on a TEI XML document.

    Each ``seg`` element is treated as a single word and spelling corrections
    will be inserted using a choice tag. Correct words will be untouched and
    correction candidates will be sorted by edit distance.

    Args:
        facsimile (nidaba.tei.TEIFacsimile): TEIFacsimile object.
        dictionary (unicode): Path to a base dictionary.
        deletion_dictionary (unicode): Path to a deletion dictionary.
        filter_punctuation (bool): Switch to filter punctuation inside
                                   segments.

    Returns:
        A TEIFacsimile object containing the spelling corrections.
    """
    text_tokens = [x[-1] for x in facsimile.segments]
    if filter_punctuation:
        text_tokens = [regex.sub('[^\w]', '', x) for x in text_tokens]
    suggestions = spellcheck(text_tokens, dictionary, deletion_dictionary)
    facsimile.add_respstmt('spell-checker', 'nidaba-levenshtein')
    for segment in facsimile.segments:
        key = alg.sanitize(segment[-1])
        if filter_punctuation:
            key = regex.sub('[^\w]', '', key)
        if key not in suggestions:
            continue
        for sugg in suggestions[key]:
            facsimile.add_choices(segment[-2], [(sugg, 100 - 10 *
                                  alg.edit_distance(key, sugg))])
    return facsimile
Example No. 11
def parse_text(element):
    n = (element.attrib["_note"])
    n = re.sub(r'[/]', '<br>', n)
    n = re.sub(r'[(]', '<em><small>', n)
    n = re.sub(r'[)]', '</small></em>', n)
    prayer = n.strip().splitlines()
    return prayer
Example No. 12
def normalize_newlines(string):
    out = string.strip()
    out = re.sub(r'\r\n', '\n', out)
    out = re.sub(r'\n{3,}', '\n\n', out)
    out = re.sub(r'\n\s*\n', '\n\n', out)
    out = re.sub(r'"$', '" ', out)
    return out
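For reference, a small example of the collapsing behaviour:

raw = "para one\r\nstill one\n\n\n\npara two"
print(repr(normalize_newlines(raw)))   # -> 'para one\nstill one\n\npara two'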
Example No. 13
def lcc_range(string):
    """
    Takes a string, returns a tuple of two LCClassNumbers, the start and
    end of the range.
    """
    string = string.encode("ascii","replace")
    string = string.replace("(","")
    string = string.replace(")","")
    if string.endswith("A-Z"):
        # TMI in the schedules when they're alphabetical.
        # I don't care.
        string.replace("A-Z","")

    if "-" not in string:
        # A range of self length.
        return (LCCallNumber(string), LCCallNumber(string))

    parts = string.split("-")
    if re.search(r"^\d",parts[1]):
        header = re.sub("^([A-Z]+).*",r"\1",parts[0])
    elif re.search(r"^\.",parts[1]):
        header = re.sub(r"^([A-Z]+\d)+\..*",r"\1",parts[0])
    elif re.search(r"^[A-Z]",parts[1]):
        header = re.sub(r"^([A-Z]+\d)+\..[A-Z]*",r"\1.",parts[0])            
    else:
        header = " "

    parts[1] = header + parts[1]
    return (
        LCCallNumber(parts[0]),
        LCCallNumber(parts[1])
    )
Example No. 14
def fix_broken_paragraphs(in_bytes):   
    out = in_bytes
    out = regex.sub(rb'''(?<=\p{lower}\s*)</(blockquote|p|div)>
                        \s*
                        <\1[^>]*>\s*(?=\p{lower})''', 
                        b' ',
                        out, flags=regex.VERBOSE|regex.I)
    
    out = regex.sub(rb'''(?<=\p{lower}\s*)
                        <p[^>]*>(?=\s*\p{lower})''', 
                        b' ',
                        out, flags=regex.VERBOSE|regex.I)
    
    # Deal with a wrong paragraph break on a hyphenated word
    # (v.ugly)
    out = regex.sub(rb'''(?<=\p{lower})-</(blockquote|p|div)>
                        \s*
                        <\1[^>]*>\s*(?=\p{lower})''', 
                        b'',
                        out, flags=regex.VERBOSE|regex.I)
    
    out = regex.sub(rb'(?<=\p{lower})-<p[^>]*>(?=\s*\p{lower})', 
                        b'',
                        out, flags=regex.I)
    return out
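A sketch of the intended effect on a paragraph break that splits a sentence (bytes in, bytes out), assuming the regex module is imported as in the function above:

broken = b"<p>the sentence continues</p>\n<p>on the next paragraph.</p>"
print(fix_broken_paragraphs(broken))
# -> b'<p>the sentence continues on the next paragraph.</p>'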
Example No. 15
def rev_ip(ip, delimiter=None):
    revip = False
    eip = expand_ip(ip)
    prefix = False

    if '/' in eip:
        eip, prefix = regex.split('/', eip)[0:2]
    else:
        if is_ip4.search(eip):
            prefix = '32'
        elif is_ip6.search(eip):
            prefix = '128'

    if prefix:
        prefix = int(prefix)
        if is_ip4.search(eip):
            if prefix in (8, 16, 24, 32):
                revip = '.'.join(eip.split('.')[0:int(prefix / 8)][::-1]) + '.in-addr.arpa.'
            elif delimiter:
                octs = eip.split('.')[::-1]
                octs[3 - int(prefix / 8)] = octs[3 - int(prefix / 8)] + delimiter + str(prefix)
                revip = '.'.join(octs[3 - int(prefix / 8):]) + '.in-addr.arpa.'

        elif is_ip6.search(eip):
            if prefix in (4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128):
                revip = '.'.join(filter(None, regex.split('(.)', regex.sub(':', '', eip))))[0:int(prefix / 4) * 2][::-1].strip('.') + '.ip6.arpa.'
            elif delimiter:
                nibs = list(filter(None, regex.split('(.)', regex.sub(':', '', eip))))[::-1]
                nibs[31 - int(prefix / 4)] = nibs[31 - int(prefix / 4)] + delimiter + str(prefix)
                revip = '.'.join(nibs[31 - int(prefix / 4):]) + '.ip6.arpa.'

    return revip
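The helpers expand_ip, is_ip4 and is_ip6 are defined elsewhere; assuming they behave as their names suggest (expand_ip keeps any /prefix suffix intact), the reverse names come out roughly like this:

# rev_ip('192.0.2.1')         -> '1.2.0.192.in-addr.arpa.'       (host, treated as /32)
# rev_ip('192.0.2.0/24')      -> '2.0.192.in-addr.arpa.'         (octet-aligned prefix)
# rev_ip('192.0.2.0/26', '-') -> '0-26.2.0.192.in-addr.arpa.'    (delimiter form for non-octet prefixes)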
Example No. 16
def cleanTweet(tweet, query_term):
    """
    """
    new_string = ''
    for i in tweet.split(): # remove urls, hashtag characters, and full @username mentions
        s, n, p, pa, q, f = urlparse.urlparse(i)
        if s and n:
            pass
        elif i[:1] == '@':
            pass
        elif i[:1] == '#':
            new_string = new_string.strip() + ' ' + i[1:]
        else:
            new_string = new_string.strip() + ' ' + i

    table = string.maketrans("","") # make a translation table (never used below)
    new_string = re.sub("[^A-Za-z']+", ' ', new_string) # aggressive: replaces everything except letters and apostrophes (works only for Latin-based scripts, and maybe only English)
    new_string = new_string.replace(" amp ", " ") # remove html code for ampersands (&)
    new_string = new_string.lower() # lowercase entire tweet
    new_string = re.sub(r'(.)\1+', r'\1\1', new_string) # reduce any character repeated more than twice down to two
    new_string = new_string.replace(query_term, " ") # take the original value used to collect tweets as a system argument, and remove it from tweets
    new_string = re.sub(r'(?<!\S)\S{1}(?!\S)', '', new_string)
    new_string = ' '.join(new_string.split()) # remove additional spaces

    return new_string
Example No. 17
    def writeout(self, igraph, out):
        
        char = chr(int(igraph['code'], 16))
        if char not in self.existing or char in self.seen:
            return

        definition = igraph.get('kDefinition', '')
        definition = regex.sub(r' U\+\w+', '', definition)

        phon = set()
        mn = igraph.get('kMandarin', None)
        hu = igraph.get('kHanyuPinlu', None)
        hn = igraph.get('kHanyuPinyin', None)
        if hn:
            hn = regex.sub(r'\d+\.\d+:', '', hn)
        if hu:
            hu = regex.sub(r'\(\d+\)', '', hu)
        for p in [mn, hu, hn]:
            if p:
                phon.update(regex.split(r'[, ]+', p))
        phon = ",".join(sorted(phon))

        if not phon:
            return
        
        if not self.first:
            out.write(',\n')
        else:
            self.first = False
        out.write('\'{}\': {}'.format(char, [phon, definition]))
Example No. 18
def normalize_number(number, country_code):
    """
    Normalizes the passed in number, they should be only digits, some backends prepend + and
    maybe crazy users put in dashes or parentheses in the console.
    :param number: the number, e.g. "0783835665"
    :param country_code: the 2-letter country code, e.g. "RW"
    :return: a tuple of the normalized number and whether it looks like a possible full international number
    """
    # if the number ends with e11, then that is Excel corrupting it, remove it
    if number.lower().endswith("e+11") or number.lower().endswith("e+12"):
        number = number[0:-4].replace('.', '')

    # remove other characters
    number = regex.sub(r'[^0-9a-z\+]', '', number.lower(), flags=regex.V0)

    # add on a plus if it looks like it could be a fully qualified number
    if len(number) >= 11 and number[0] != '+':
        number = '+' + number

    try:
        normalized = phonenumbers.parse(number, str(country_code) if country_code else None)

        # now does it look plausible?
        if phonenumbers.is_possible_number(normalized):
            return phonenumbers.format_number(normalized, phonenumbers.PhoneNumberFormat.E164), True
    except Exception:
        pass

    # this must be a local number of some kind, just lowercase and save
    return regex.sub(r'[^0-9a-z]', '', number.lower(), flags=regex.V0), False
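Illustrative calls; the exact E.164 result depends on the phonenumbers metadata for the region:

print(normalize_number("0783835665", "RW"))      # -> ('+250783835665', True) with current metadata
print(normalize_number("(078) 383-5665", "RW"))  # punctuation is stripped before parsing, same result
print(normalize_number("5665", None))            # -> ('5665', False), kept as a local number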
Example No. 19
    def _clean_word(self, word):
        """
        Preprocess words after tokenizing words from sentences.

        - Remove apostrophes ['s, s'].
        - Bring to lowercase.
        - Remove punctuations.
        - Remove English words from Non-English corpus data.
        """
        # Note: \p{P} and \p{S} need the `regex` module (or `import regex as re`) to work.
        if self.language == "english":
            regex = r"((\p{P}+)|(\p{S}+)|([0-9]+))"
        else:
            regex = r"((\p{P}+)|(\p{S}+)|([0-9]+)|([A-Za-z]))"
        # Handle Apostrophe's correctly you'll => you
        selected_word = re.match(pattern=u"(.*)['’].*?", string=word)
        # If selected word matches a word with apostrophe
        if selected_word is not None:
            word = selected_word.groups()[0]
        # Handle Pair words ice-cream => ice cream
        word = re.sub(pattern="-", repl=' ', string=word)
        return re.sub(
            pattern=regex,
            repl='',
            string=word.lower()
        ).strip().split()
Example No. 20
def transform(self, text):
    for pattern, replace in self.pattern_replace_pair_list:
        try:
            text = regex.sub(pattern, replace, text)
        except:
            # skip patterns that fail to compile or apply
            pass
    return regex.sub(r"\s+", " ", text).strip()
Example No. 21
def test_post(title, body, user_name, site, is_answer, body_is_summary):
    result = []
    for rule in FindSpam.rules:
        body_to_check = body
        if rule['stripcodeblocks']:
            body_to_check = regex.sub("<pre>.*?</pre>", "", body, flags=regex.DOTALL)
            body_to_check = regex.sub("<code>.*?</code>", "", body_to_check, flags=regex.DOTALL)
        if rule['all'] != (site in rule['sites']):
            matched_title = regex.compile(rule['regex'], regex.UNICODE).findall(title)
            matched_username = regex.compile(rule['regex'], regex.UNICODE).findall(user_name)
            matched_body = regex.compile(rule['regex'], regex.UNICODE).findall(body_to_check)
            if matched_title and rule['title']:
                try:
                    if getattr(FindSpam, "%s" % rule['validation_method'])(matched_title):
                        result.append(rule['reason'])
                except KeyError:  # There is no special logic for this rule
                    result.append(rule['reason'].replace("{}", "title"))
            if matched_username and rule['username']:
                try:
                    if getattr(FindSpam, "%s" % rule['validation_method'])(matched_username):
                        result.append(rule['reason'])
                except KeyError:  # There is no special logic for this rule
                    result.append(rule['reason'].replace("{}", "username"))
            if matched_body and rule['body'] and (not body_is_summary or rule['body_summary']):
                type_of_post = "answer" if is_answer else "body"
                try:
                    if getattr(FindSpam, "%s" % rule['validation_method'])(matched_body):
                        result.append(rule['reason'].replace("{}", type_of_post))
                except KeyError:  # There is no special logic for this rule
                    result.append(rule['reason'].replace("{}", type_of_post))
    return result
Example No. 22
    def wptexturize(self, text):
        # Transform into regexp sub-expression used in _wptexturize_pushpop_element
        # Must do this every time in case plugins use these filters in a context sensitive manner
        no_texturize_tags = '(' + '|'.join(self.default_no_texturize_tags) + ')'
        no_texturize_shortcodes = '(' + '|'.join(self.default_no_texturize_shortcodes) + ')'

        no_texturize_tags_stack = []
        no_texturize_shortcodes_stack = []

        # PHP: Since Python doesn't support PHP's /U modifier (which inverts quantifier's greediness), I modified the regular expression accordingly
        textarr = regex.split('(<.*?>|\[.*?\])', text, flags=regex.DOTALL)

        result = []
        for curl in textarr:
            if len(curl) == 0:
                continue

            # Only call _wptexturize_pushpop_element if first char is correct tag opening
            first = curl[0]
            if '<' == first:
                self.__wptexturize_pushpop_element(curl, no_texturize_tags_stack, no_texturize_tags, '<', '>')
            elif '[' == first:
                self.__wptexturize_pushpop_element(curl, no_texturize_shortcodes_stack, no_texturize_shortcodes, '[', ']')
            elif len(no_texturize_shortcodes_stack) == 0 and len(no_texturize_tags_stack) == 0:
                # This is not a tag, nor is the texturization disabled static strings
                for search, replacement in self.static:
                    curl = curl.replace(search, replacement)
                # regular expressions
                for search, replacement in self.dynamic:
                    curl = regex.sub(search, replacement, curl)
            curl = regex.sub('&([^#])(?![a-zA-Z1-4]{1,8};)', '&#038;\\1', curl)
            result.append(curl)
        return ''.join(result)
Example No. 23
    def parse_implied_depth(self, element):
        ja_depth_pattern = ur"\[(\d)\]$"
        ja_sections_pattern = ur"\[(.*)\]$"
        title_str = element.get('text').strip()

        depth_match = re.search(ja_depth_pattern, title_str)
        if depth_match:
            depth = int(depth_match.group(1))
            placeholder_sections = ['Volume', 'Chapter', 'Section', 'Paragraph']
            element.set('text', re.sub(ja_depth_pattern, "", title_str))
            return {'section_names': placeholder_sections[(-1 * depth):], 'address_types' : ['Integer'] * depth}

        sections_match = re.search(ja_sections_pattern, title_str)
        if sections_match:
            sections = [s.strip() for s in sections_match.group(1).split(",")]
            element.set('text', re.sub(ja_sections_pattern, "", title_str))
            section_names = []
            address_types = []
            for s in sections:
                tpl = s.split(":")
                section_names.append(tpl[0])
                address_types.append(tpl[1] if len(tpl) > 1 else 'Integer')

            return {'section_names': section_names, 'address_types' : address_types}
        else:
            return None
Example No. 24
    def after(self):
        order = [u'Roman', u'Wrong?', u'Okay?', u'Other']
        for key, data in sorted(self.content_mixed_cyrl_latn_extra.items(), key=lambda x: order.index(x[0])):
            content = u"""== Описание ==
Здесь представлены статьи, в которых присутствует смесь кириллицы и латиницы в содержимом.

Обсудить можно '''[[Обсуждение Викисловаря:Отчёты|здесь]]'''.

== Список результатов ==
"""
            items = sorted(data.items(), key=lambda x: x[0])
            for title, sub_items in items:
                content += u"# [[{0}]]\n".format(title)
                for value in sub_items:
                    value = \
                        regex.sub(u'(\p{IsLatin}+)',
                               u'<span style="background-color: #FFD0D0;">\g<1></span>',
                               value, flags=re.IGNORECASE | re.UNICODE)
                    value = \
                        regex.sub(u'(\p{IsCyrillic}+)',
                               u'<span style="background-color: #D0FFD0;">\g<1></span>',
                               value, flags=re.IGNORECASE | re.UNICODE)
                    content += u'#* <code>{}</code>\n'.format(value.replace('\n', ' ').strip())
            title = u'Ошибки/Содержимое/Ошибки/Смесь кириллицы и латиницы/Однобуквенные случаи/{}'.format(key)
            count = len(data)
            self.process_report(title, content, count)
        super(ContentMixedCyrlLatnExtra, self).after()
Example No. 25
    def remove_article(self, text):
        # \m and \M are regex-module word-boundary anchors, so use regex.sub rather than re.sub.
        for art in self.articles:
            text = regex.sub(r"^\s*\m%s\M\s*" % art, " ", text)

        text = regex.sub(r"\s*\mο\M", "", text)
        text = regex.sub(r"\s*\mείναι\M", "", text)
        return text.strip()
Example No. 26
def clean_name(name):
    """
    Cleans a show/movie name for searching.

    :param name: release name
    :return: cleaned name
    """

    name = unicodedata.normalize('NFKD', name)

    name = regex.sub('[._\-]', ' ', name)
    name = regex.sub('[\':!"#*’,()?]', '', name)
    name = regex.sub('\s{2,}', ' ', name)
    name = regex.sub('\[.*?\]', '', name)

    replace_chars = {
        '$': 's',
        '&': 'and',
        'ß': 'ss'
    }

    for k, v in replace_chars.items():
        name = name.replace(k, v)

    name = CLEANING_REGEX.sub('', name)

    return name.lower()
Example No. 27
def main():

    args = parser.parse_args()

    tt = TinyTokenizer()

    for line in open(args.infile):
        line=line.strip()

        out = tt.tokenize(line)
        outline = " ".join(out)
        try:
            assert(str(regex.sub(r"\s","",line))==str(regex.sub("\s","",outline)))
            if args.conll:
                for w in out:
                    print(w)
                print()
            else:
                print(outline)
            
        except:
            print("==== CHECK FILE! ====",  args.infile, file=sys.stderr)
            print("+"*20, file=sys.stderr)
            print("in:  >>{}<<".format(line), file=sys.stderr)
            print("out: >>{}<<".format(outline), file=sys.stderr)     
            print(str(regex.sub(r"\s","",line)), file=sys.stderr)
            print(str(regex.sub(r"\s","",outline)), file=sys.stderr)
Example No. 28
def fix_hyphens(word):
    for i in range(0, 2):
        word = regex.sub(r'-({})({})'.format(cons, cons), r'\1-\2', word, flags=regex.I)
        word = regex.sub(r'([kgcjḍṭdtpb])-(h{})'.format(vowel_pattern), r'\1\2-', word, flags=regex.I)
    word = regex.sub(r'^(\p{alpha}{0,3})-', r'\1', word)
    word = regex.sub(r'-(\p{alpha}{0,3})$', r'\1', word)
    return word
Example No. 29
def parse_text(element):
    n = element.attrib["_note"]
    n = re.sub(r"[/]", "<br>", n)
    n = re.sub(r"[(]", "<em><small>", n)
    n = re.sub(r"[)]", "</small></em>", n)
    prayer = n.strip().splitlines()
    return prayer
Example No. 30
def normalize(self, s):
    s = re.sub(":", "", s)      # remove subtitle ':'
    s = re.sub("-", "", s)      # remove subtitle '-'
    s = re.sub("  ", " ", s)    # collapse double spaces
    s = re.sub("The ", "", s)   # remove prefix 'The'
    s = re.sub(", The", "", s)  # remove suffix ', The'
    return s
Example No. 31
def whitespace_clean(text):
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text
Example No. 32
def on_msg(msg, client):
    global _room_roles

    if not isinstance(msg, events.MessagePosted) and not isinstance(
            msg, events.MessageEdited):
        return

    message = msg.message
    room_ident = (client.host, message.room.id)
    room_data = _rooms[room_ident]

    if message.owner.id == client._br.user_id:
        if 'direct' in _room_roles and room_ident in _room_roles['direct']:
            SocketScience.receive(
                message.content_source.replace("\u200B",
                                               "").replace("\u200C", ""))

        return

    if message.content.startswith("<div class='partial'>"):
        message.content = message.content[21:]
        if message.content.endswith("</div>"):
            message.content = message.content[:-6]

    if message.parent:
        try:
            if message.parent.owner.id == client._br.user_id:
                strip_mention = regex.sub(
                    "^(<span class=(\"|')mention(\"|')>)?@.*?(</span>)? ", "",
                    message.content)
                cmd = GlobalVars.parser.unescape(strip_mention)

                result = dispatch_reply_command(message.parent, message, cmd)

                if result:
                    s = ":{}\n{}" if "\n" not in result and len(
                        result) >= 488 else ":{} {}"
                    _msg_queue.put((room_data, s.format(message.id,
                                                        result), None))
        except ValueError:
            pass
    elif message.content.lower().startswith("sd "):
        result = dispatch_shorthand_command(message)

        if result:
            s = ":{}\n{}" if "\n" not in result and len(
                result) >= 488 else ":{} {}"
            _msg_queue.put((room_data, s.format(message.id, result), None))
    elif message.content.startswith("!!/"):
        result = dispatch_command(message)

        if result:
            s = ":{}\n{}" if "\n" not in result and len(
                result) >= 488 else ":{} {}"
            _msg_queue.put((room_data, s.format(message.id, result), None))
    elif classes.feedback.FEEDBACK_REGEX.search(message.content) \
            and is_privileged(message.owner, message.room) and datahandling.last_feedbacked:
        ids, expires_in = datahandling.last_feedbacked

        if time.time() < expires_in:
            Tasks.do(metasmoke.Metasmoke.post_auto_comment,
                     message.content_source,
                     message.owner,
                     ids=ids)
    elif 'direct' in _room_roles and room_ident in _room_roles['direct']:
        SocketScience.receive(
            message.content_source.replace("\u200B", "").replace("\u200C", ""))
Example No. 33
!unzip amazon-reviews-unlocked-mobile-phones.zip

import csv
import random
import re

# saving the data
with open("Amazon_Unlocked_Mobile.csv") as csv_file:
  csv_reader = csv.reader(csv_file)
  colnames = next(csv_reader)
  data = list(csv_reader)

# printing one sample point to see how the data is stored
print(random.sample(data,1))
#each entry consists of product name, brand, price (string), rating (in string), review, review votes.

"""# **Data Cleaning and Preprocessing - Overall Sentiment**"""

x=re.sub("[^a-zA-Z0-9\s]", "", re.sub("[,.&:-]"," ","matu6738,at&t,,3"))
print(x)

x.split()

#extracting initial reviews and ratings from the original data 
initial_reviews = []
ratings = []
review_vote = []  #could be useful later 

for x in data:
  ratings.append(int(x[3]))
  initial_reviews.append(re.sub("[^a-zA-Z0-9\s]", "", re.sub("[,&.:-]"," ",x[4].lower())))
  review_vote.append(x[5])

clean_vote = []
Example No. 34
def number_to_substring(text, latex=False):
    return regex.sub("(\d*\.?\d+)", r'_\1', text) if latex else regex.sub(
        "(\d*\.?\d+)", r'<sub>\1</sub>', text)