Example #1
def regex_to_big_query(re):
    # Take care of encoding
    re = re.replace("\\", "\\\\").replace("'", "\\'")
    # No need for grouping
    re = re.replace("(?:", "(")
    str = "(" + "LENGTH(REGEXP_REPLACE(lower(message),"  + "'%s', '@'))"   % re + "-" \
        + "LENGTH(REGEXP_REPLACE(lower(message),"  + "'%s', ''))"  % re + ")"

    return str
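
Background for the `replace` calls throughout this listing: in Example #1 the parameter `re` is a plain string, so `str.replace` is the right call, but Python's `re` module itself has no `replace` function; the module-level equivalent is `re.sub(pattern, repl, string)`. A minimal sketch contrasting the two, on a made-up sample string:

import re as regex  # aliased to avoid clashing with variables named `re` in these examples

text = "one error and one warning"

# str.replace: literal substring substitution
print(text.replace("error", "@"))               # one @ and one warning

# re.sub: regex substitution; there is no re.replace in the standard library
print(regex.sub(r"error|warning", "@", text))   # one @ and one @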
Example #2
def parse_xml():
    """Parse xml and print only content enclosed in <post> tags"""
    post = False
    for line in stdin:
        line = line.strip()
        if is_match(r'</post|<quote', line):
            post = False
        elif post and line:
            print replace(r'<img(.*?)/>|<a(.*?)</a>', '', line)
        elif is_match(r'<post|</quote', line):
            post = True
Example #3
    def _update_imported_attachement_titles(self):
        """ Builds a pretty title from filename for attachments and later
        it saves them
            - Removes extension
            - Removes extra spaces
        """

        for record in self.imported_attachment_ids:
            name = record.name
            name = replace(r'^(.+)(\.[^.]*)$', r'\1', name, flags=UNICODE)
            name = replace(r'[ \-_]+', r' ', name, flags=UNICODE)
            record.write({'name' : name.title()})
Example #4
    def _update_imported_attachement_titles(self):
        """ Builds a pretty title from filename for attachments and later
        it saves them
            - Removes extension
            - Removes extra spaces
        """

        for record in self.imported_attachment_ids:
            name = record.name
            name = replace(r'^(.+)(\.[^.]*)$', r'\1', name, flags=UNICODE)
            name = replace(r'[ \-_]+', r' ', name, flags=UNICODE)
            record.write({'name' : name.title()})
Example #5
def handleURL(line):
    link, url, text = getURL(line)
    if link is None:
        return line.replace("[url]", "")

    v["%u"] = url
    v["%v"] = text

    text = subTags("URLs", "url")
    line = line.replace(link, text)

    url = subTags("URLs", "url-attr")
    line = line.replace("[url]", url)

    return line
Example #6
def handleURL(line):
    link, url, text = getURL(line)
    if link is None:
        return line.replace("[url]", "")

    v["%u"] = url
    v["%v"] = text

    text = subTags("URLs", "url")
    line = line.replace(link, text)

    url = subTags("URLs", "url-attr")
    line = line.replace("[url]", url)

    return line
Example #7
def eigthDayB():
    f = open("day8.txt", 'r')
    totalStringCharacters = 0
    totalCharactersEncoded = 0
    for line in f:
        line = line.rstrip()
        totalStringCharacters += len(line)
        print totalStringCharacters
        line.replace('\\"', '"')  # strip escaping from quotes (return value is discarded)
        print re.findall('\\\\"', line)
        print re.findall('\\\\x', line)
        totalCharactersEncoded += len(line) + 4 + len(re.findall('\\\\"', line)) * 2 + len(re.findall('\\\\.', line))
        print totalCharactersEncoded
        print "\n"
    print totalCharactersEncoded - totalStringCharacters 
Example #8
def call_google(params, payload):
    gcr = requests.post('https://vision.googleapis.com/v1/images:annotate',
                        params=params,
                        json=payload)
    try:
        gc_response = json.loads(gcr.content)['responses'][0]
        if gc_response['textAnnotations'][0]['locale'] == 'en':
            text = gc_response['textAnnotations'][0]['description']
            text = replace(r"(\r\n|\n|\r)", " ", text)
            if text == "":
                return text
            words = text.split(' ')
            valid = 0
            invalid = 0
            for word in words:
                if word in english:
                    valid += 1
                else:
                    invalid += 1
            if invalid >= valid:
                text = "We are not confident in the text transcription of this image. " + "This image contains the text: " + text + ". "
            else:
                text = "This image contains the text: " + text + ". "
        else:
            text = "This image contains text in a language our extension cannot understand. "
    except KeyError as e:
        text = ""
    return text
Example #9
def save_image_result(driver):
    # type ={'type':'login'}
    image_loc = ('css selector', 'div#captcha>img#codeimg')
    # url = 'https://passport.douguo.com'
    get_src = driver.find_element(*image_loc)

    size = get_src.size
    print(size)
    # the x/y crop box below was located with the WeChat screenshot tool
    geti = ImageGrab.grab((1250, 600, 1350, 631))

    base_dir = os.path.dirname(os.path.dirname(__file__))
    base_dir = str(base_dir)
    base_dir = base_dir.replace('\\', '/')
    base = base_dir.split('/test_case')[0]

    path = base + '/report/image/' + 'image.png'
    re = pytesseract.image_to_string(geti)
    print(re)
    # geti.save(path)
    # f = Image.open(path)

    # getimage = f.crop(ra)
    geti.save(path)
    # verify = Image.open(path)
    # result = pytesseract.image_to_string(verify)

    result = re.replace(' ', '').replace('\n', '').replace('\u3000', '')
    print(result)

    return result
Example #10
def slice2(s):
    s = re.sub(r'[^a-z]', ' ', s.lower())  # map non-letters to spaces so bigrams never span words
    res = []
    for i in range(len(s) - 1):
        tmp = s[i:i + 2]
        if ' ' not in tmp: res.append(tmp)
    return res
Example #11
def power():
    InputLatex="x ^ 2"
    string = ""
    str = ""
    dictstr = ""
    Outcome = " "
    InputLatex.split(' ')
    print(InputLatex)
    match = re.match('sum_{.*?=.*?}\^{.*?}|prod_{.*?=.*?}\^{.*?}|int_{.*?}\^{.*?}', InputLatex)
    powerfuncmatch = "^"
    #powerfuncmatch = re.match('\\\\^', InputLatex)
    #basefuncmatch = re.match('\\\\_', InputLatex)
    # print(match)
    if(match):
        print("found")
    if (powerfuncmatch):
        #re.compile(powerfuncmatch)
        for equ in InputLatex:
            queue.append(equ)
            print(queue)
        if re.search(re.escape(powerfuncmatch), InputLatex):
            InputLatex = re.sub(re.escape(powerfuncmatch), 'to the power of', InputLatex)


    print("Now here")
    print(InputLatex)
Example #12
    def make_formater(form):
        '''Replace the words in the form with the appropriate logging syntax and return the formatter to be added.'''

        for word in replace(r'[^\w]', ' ', form).split():
            if word in FORMATS.keys():
                form = form.replace(word, FORMATS[word])
        return logging.Formatter(form)
Example #13
    def clean_list(self, pagename):
        a = input("1-follower/2-following:  ")

        def openfile(pagename):
            if a == "1":
                open_file = open(
                    "/root/Desktop/py/followerslist/" + pagename +
                    "followers.txt", "r+")
                read = open_file.read()
                return read
                open_file.close()
            if a == "2":
                open_file = open(
                    "/root/Desktop/py/following list/" + pagename +
                    "following.txt", "r+")
                read = open_file.read()
                return read
                open_file.close()

        read = openfile(pagename)
        re = read.replace("'https://www.instagram.com/", "")
        me = re.replace("/'", "")
        de = me.replace(" ", "\n")
        if a == "1":
            he = open(
                "/root/Desktop/py/clean/" + pagename + "followersclean.txt",
                "w+")
            he.write(de)
            he.close()
        elif a == "2":
            he = open(
                "/root/Desktop/py/clean/" + pagename + "followingclean.txt",
                "w+")
            he.write(de)
            he.close()
Example #14
	def set_file_postfix(self, file, extension):
		'''
		Set the filename postfix
		'''
		postfix = self.options.name[0]

		return postfix and replace('.%s$' % extension, '%s.%s' % (postfix, extension), file)
Example #15
def test_regex_match():
    datafile = replace('\.py$', '.json', __file__)
    with open(datafile, 'r') as f:
        data = json_load(f)

    for sample in data:
        assert RE_COLOR.match(sample) is not None
Example #16
 def import_asset( self, id ):
     out = []
     queue = [ id ]
     while queue:
         id = queue[0]
         if id in self._assets and self._assets[id][4] and not self._assets[id][5]: # enqueued but not loaded yet
             asset_def = self._assets[id]
             type = asset_def[0]
             id = asset_def[1]
             asset = asset_def[2]
             deps = asset_def[3]
             if deps:
                 needs_deps = False
                 for dep in reversed( deps ):
                     if dep in self._assets and not self._assets[dep][5]:
                         self._assets[dep][4] = True # enqueued
                         needs_deps = True
                         queue.insert( 0, dep )
                 if needs_deps: continue
                 else: queue.pop( 0 )

             asset_def[5] = True # loaded

             # hook here
             ret = {}
             self.trigger("import-asset", [
                 # importer, id,      type,   asset
                 self, id, type, asset, ret
             ]).trigger("import-asset-"+id, [
                 # importer, id,      type,   asset
                 self, id, type, asset, ret
             ])

             if 'return' in ret:
                 out.append( ret['return'] )

             else:
                 is_style = 'styles' == type
                 is_script = 'scripts' == type
                 is_tpl = 'templates' == type
                 is_inlined = isinstance( asset, (list, tuple) )
                 asset_id = re.sub( r'[\-.\/\\:]+', '_', id )

                 if is_style:
                     out.append( ("<style id=\"importer-inline-style-%s\" type=\"text/css\" media=\"all\">%s</style>" % (asset_id, asset[0])) if is_inlined else ("<link id=\"importer-style-%s\" type=\"text/css\" rel=\"stylesheet\" href=\"%s\" media=\"all\" />" % (asset_id, self.path_url(asset))) )

                 elif is_script:
                     out.append( ("<script id=\"importer-inline-script-%s\" type=\"text/javascript\">/*<![CDATA[*/ %s /*]]>*/</script>" % (asset_id, asset[0])) if is_inlined else ("<script id=\"importer-script-%s\" type=\"text/javascript\" src=\"%s\"></script>" % (asset_id, self.path_url(asset))) )

                 elif is_tpl:
                     out.append( ("<script id=\"importer-inline-tpl-%s\" type=\"text/x-tpl\">%s</script>" % (asset_id, asset[0])) if is_inlined else ("<script id=\"importer-inline-tpl-%s\" type=\"text/x-tpl\">%s</script>" % (asset_id, self.get(asset))) )

                 else:
                     out.append( asset[0] if is_inlined else self.get(asset) )
         else:
             queue.pop( 0 )
Example #17
    def _get_cleared_text(self):
        """ Perform some operations to obtain a cleared text
            - Replace tabs with spaces and remove extra spaces
            - Replace extra symbols after list elements
        """
        content = (self.content or '').strip()
        flags = MULTILINE | UNICODE

        # STEP 1: Remove tabs and extra spaces
        content = replace(r'[ \t]+', r' ', content, flags=flags)

        # STEP 2: Remove spaces from both line bounds
        content = replace(r'[ \t]+$', r'', content, flags=flags)
        content = replace(r'^[ \t]+', r'', content, flags=flags)

        # STEP 3: Replace CRLF by LF and remove duplicates
        content = replace(r'[\r\n]', r'\n', content, flags=flags)
        content = replace(r'[\n]{2,}', r'\n\n', content, flags=flags)

        # STEP 4: Update questions and answers numbering formats
        pattern = r'^([0-9]{1,10})[.\-)]+[ \t]+'
        content = replace(pattern, r'\1. ', content, flags=flags)
        pattern = r'^([a-zñA-ZÑ])[.\-)]+[ \t]+'
        content = replace(pattern, r'\1) ', content, flags=flags)

        return content
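
For reference, a minimal standalone sketch of what the two numbering rules in STEP 4 do, using `re.sub` directly on a made-up sample:

import re

flags = re.MULTILINE | re.UNICODE
sample = "1)   What is a regex?\na.- A pattern for matching text"

sample = re.sub(r'^([0-9]{1,10})[.\-)]+[ \t]+', r'\1. ', sample, flags=flags)
sample = re.sub(r'^([a-zñA-ZÑ])[.\-)]+[ \t]+', r'\1) ', sample, flags=flags)

print(sample)
# 1. What is a regex?
# a) A pattern for matching text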
Example #18
def escape_re(re):
    re = re.replace(".", "\\.")
    re = re.replace("(", "\\(")
    re = re.replace(")", "\\)")
    re = re.replace("|", "\\|")
    re = re.replace("^", "\\^")
    re = re.replace("*", "\\*")
    re = re.replace("+", "\\+")
    re = re.replace("?", "\\?")

    return re
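
This helper only covers part of the metacharacter set (for instance `[`, `{`, `$` and the backslash itself are left alone); where possible it is simpler to lean on the standard library's `re.escape`, which escapes every special character. A minimal sketch:

import re

literal = "price (USD)? 3.50+"
pattern = re.escape(literal)          # escapes every regex metacharacter in the string
print(bool(re.search(pattern, "total price (USD)? 3.50+ tax")))  # True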
Example #19
def repl(re) :
	""" Converts PROSITE regular expressions into patterns for Python's re module """

	mal = ['.', '-', '{', '}', '(', ')', '<', '>', 'x', '>]'] # PROSITE tokens
	bien = ['', '', '[^', ']', '{', '}', '^', '$', '[GAVLIMPFYWSCTNQDEKRH]', ']?$'] # their Python re equivalents

	for i in range(len(mal)) :
		re = re.replace(mal[i], bien[i])

	return re
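
As a quick illustration, assuming the classic PROSITE N-glycosylation pattern as input, the mapping produces a plain Python regex:

# hypothetical call on the N-glycosylation site pattern N-{P}-[ST]-{P}.
print(repl("N-{P}-[ST]-{P}."))   # N[^P][ST][^P]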
Example #20
def getTemplates(args):
	if(not args.tokenFeatures):
		return [TokenString(allowOOV = True)]
	else:
		templates = []
		for name in args.featureTemplates.split(","):
			if(name == "tokenString"):
				templates.append(TokenString(allowOOV = True))
			elif(name == "isCap"):
				templates.append(Capitalized(allowOOV = False))
			elif(name == "isNumeric"):
				templates.append(IsNumeric(allowOOV = False))
			elif(re.match(r"Suffix-\d+",name)):
				num = re.sub(r"Suffix-", "", name)
				templates.append(Suffix(int(num),allowOOV = True))
			elif(re.match(r"Prefix-\d+",name)):
				num = re.sub(r"Prefix-", "", name)
				templates.append(Prefix(int(num),allowOOV = True))
		return templates
Example #21
def test_regex():
    datafile = replace('\.py$', '.json', __file__)
    with open(datafile, 'r') as f:
        data = json_load(f)

    for expectation in data:
        match = RE_CONFLICTED_PACKAGES.match(expectation['output'])
        assert match is not None
        assert match.group('package1') == expectation['result'][0]
        assert match.group('package2') == expectation['result'][1]
Example #22
def getTemplates(args):
    if (not args.tokenFeatures):
        return [TokenString(allowOOV=True)]
    else:
        templates = []
        for name in args.featureTemplates.split(","):
            if (name == "tokenString"):
                templates.append(TokenString(allowOOV=True))
            elif (name == "isCap"):
                templates.append(Capitalized(allowOOV=False))
            elif (name == "isNumeric"):
                templates.append(IsNumeric(allowOOV=False))
            elif (re.match(r"Suffix-\d+", name)):
                num = re.sub(r"Suffix-", "", name)
                templates.append(Suffix(int(num), allowOOV=True))
            elif (re.match(r"Prefix-\d+", name)):
                num = re.sub(r"Prefix-", "", name)
                templates.append(Prefix(int(num), allowOOV=True))
        return templates
Example #23
def add_new_accounts():
    global CG_USERS

    CG_USERS.clear()
    with open(files["users"], "r") as f:
        CG_USERS = f.readlines()

    filter = lambda s: replace("^\ufeff", "", s)
    user = filter(CG_USERS[0]).split(":")

    return user[:2]
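
The `"^\ufeff"` pattern strips a UTF-8 byte-order mark that some editors prepend to the first line; an alternative sketch (same `files["users"]` path) lets Python 3's open() drop it via the `utf-8-sig` codec:

# utf-8-sig transparently removes a leading BOM while reading
with open(files["users"], "r", encoding="utf-8-sig") as f:
    users = f.readlines()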
Example #24
 def isMasterFromConfigFile(self, chat_id):
     if not hasattr(self, "master") or not self.master:
         return False
     if unicode(self.master).isnumeric():
         return unicode(chat_id) == unicode(self.master)
     else:
         with self.bot.database as conn:
             cur = conn.cursor()
             cur.execute("select username from telegram_uids where uid = ?", [chat_id])
             res = cur.fetchone()
             return res is not None and unicode(res[0]) == unicode(re.sub(r'^@', '', self.master))
Example #25
def tokenize(text, stop_list):
    a = text.lower().translate(string.maketrans("",""), string.punctuation + string.digits)
    b = re.sub(r'(\s+)', ' ', a)
    c = b.split(' ')
    ### This might be too simple (simply deletes punctuation, then elims
    ### double spaces)

    ### Maybe eliminate some words here?
    ### It seems like code blocks get grabbed sometimes, so elim those
    ### To keep or not to keep numbers?
    return c
Example #26
def tokenize(text, stop_list):
    a = text.lower().translate(string.maketrans("", ""),
                               string.punctuation + string.digits)
    b = re.sub(r'(\s+)', ' ', a)
    c = b.split(' ')
    ### This might be too simple (simply deletes punctuation, then elims
    ### double spaces)

    ### Maybe eliminate some words here?
    ### It seems like code blocks get grabbed sometimes, so elim those
    ### To keep or not to keep numbers?
    return c
Example #27
 def test_assert_setting_reddit_api_scope_format(self):
     """
     For the "scope" value, Reddit's API uses a slight deviation
     from the oAuth 2.0 specifications, which states scopes should
     be space-separated. Reddit uses a comma separated value. Here,
     verify that the setting actually uses a comma separated list
     and NOT a standard oAuth space separated list.
     """
     self.assertEqual(
         settings.OAUTH_REDDIT_SCOPE,
         re.sub(r"[, ]+", ",", settings.OAUTH_REDDIT_SCOPE),
     )
Example #28
    def _get_table_condition(self, table, config):
        conds = []
        if table.extcondition:
            conds.append(
                re.sub(r"(?i)^\s*where\s+", "", table.extcondition))
        if config.filter:
            conds.append(config.filter)

        if conds:
            return " where " + " and ".join("(%s)" % c for c in conds)
        else:
            return ""
Example #29
 def test_assert_setting_reddit_api_scope_format(self):
     """
     For the "scope" value, Reddit's API uses a slight deviation
     from the oAuth 2.0 specifications, which states scopes should
     be space-separated. Reddit uses a comma separated value. Here,
     verify that the setting actually uses a comma separated list
     and NOT a standard oAuth space separated list.
     """
     self.assertEqual(
         settings.OAUTH_REDDIT_SCOPE,
         re.sub(r'[, ]+', ',', settings.OAUTH_REDDIT_SCOPE)
     )
Example #30
def get_username():

    filter = lambda s: replace("^\ufeff", "", s)

    try:
        assert len(CG_USERS), "Not accounts to CG!"
        assert ":" in CG_USERS[0], "Not accounts to CG!"
        user = filter(CG_USERS[0])
        user = user.split(":")
        return user[:2]
    except IndexError:
        raise SystemExit("Not accounts to CG!")
Example #31
 def isMasterFromConfigFile(self, chat_id):
     if not hasattr(self, "master") or not self.master:
         return False
     if unicode(self.master).isnumeric():
         return unicode(chat_id) == unicode(self.master)
     else:
         with self.bot.database as conn:
             cur = conn.cursor()
             cur.execute("select username from telegram_uids where uid = ?",
                         [chat_id])
             res = cur.fetchone()
             return res is not None and unicode(res[0]) == unicode(
                 re.sub(r'^@', '', self.master))
Example #32
def shoesize(size):
    if 'EU' in size:
        re = size.replace('EU', '')
    elif 'UK' in size:
        re = size.replace('UK', '')
    elif 'US' in size:
        re = size.replace('US', '')
    else:
        re = size
    if '.' in re:
        return re.replace('.', '')
    else:
        return re
Example #33
    def convert(self, input, target = False):
        convert_from = convert_to = multiplier_from = multiplier_to = False
        unittext = string.strip(re.sub("[^a-zA-Z ]", "", input))
        amount = string.strip(re.sub("[^0-9.,]", "", input)).replace(",", ".")
        if amount == "":
            return False

        #first determine unit of input value
        unit = self.get_unit(unittext)
        if not unit:
            return False

        convert_from = unit[1]
        if not target:
            if convert_from.has_key("derivations"):
                convert_to = convert_from.derivations[0]
            else:
                convert_to = unit[2]
        else:
            convert_to = self.get_unit(target)
            
        if convert_to.has_key("derivations"):
            if inspect.isfunction(convert_from.ratio_inverse):
                conversion = convert_from.ratio_inverse(amount)
            else:
                conversion = amount / convert_from.ratio
        else:
            if inspect.isfunction(convert_to.ratio):
                conversion = convert_to.ratio(amount)
            else:
                conversion = amount * convert_to.ratio
        
        if convert_to.has_key("factor"):
            prefix = convert_to.factor
            amount = amount / units.multipliers[convert_to.factor]["factor"]
        else:
            prefix = ""
        
        return (amount, convert_from.symbol, conversion, convert_to.symbol)
Example #34
 def action(self, state, turn, playerNum):
     print "Current State"
     print self.state
     userInput = raw_input('It\'s turn {}. Your Move, Player {}: '.format(
         turn, playerNum))
     # Cleaning input to standardized form
     userInput = re.sub(r"([0-9]+) +([0-9]+)", r"\1,\2",
                        userInput)  # fix for higher dimension
     userInput = re.sub(r"\[\] ", "", userInput)
     position = userInput.split(',')
     for i in range(len(position)):
         position[i] = int(position[i])
     return position
Example #35
def segment(all_the_text):
    re = ""
    relist = ""
    words = segmenter.seg(all_the_text)
    count = 0
    for w in words:

        if len(w) > 1 and w >= u'\u4e00' and w <= u'\u9fa5':
            re = re + " " + w
            count = count + 1
        if count % 100 == 0:
            re = re.replace("\n", " ")
            relist = relist + "\n" + re
            re = ""
            count = count + 1
    re = re.replace("\n", " ").replace("\r", " ")
    if len(relist) > 1 and len(re) > 40:
        relist = relist + "\n" + re
    elif len(re) > 40:
        relist = re
    relist = relist + "\n"
    relist = relist.replace("\r\n", "\n").replace("\n\n", "\n")

    return relist
Example #36
    def clean_list(self, filename):
        def openfile(filename):
            open_file = open(
                "/root/Desktop/py/hemin/" + filename + "following.txt", "r+")
            read = open_file.read()
            return read
            open_file.close()

        read = openfile(filename)
        re = read.replace("'https://www.instagram.com/", "")
        me = re.replace("/'", "")
        de = me.replace(" ", "\n")
        he = open("/root/Desktop/py/clean/" + filename + "clean.txt", "w+")
        he.write(de)
        he.close()
Example #37
def todo_complete(caldav_conn, args):
    if args.nocaldav:
        raise ValueError("No caldav connection, aborting")
    tasks = todo_select(caldav_conn, args)
    for task in tasks:
        if hasattr(task.instance.vtodo, 'rrule'):
            rrule = rrulestr(task.instance.vtodo.rrule.value)
            try:
                next = rrule.after(datetime.now())
            except TypeError:  ## pesky problem with comparison of timestamps with and without tzinfo
                next = rrule.after(datetime.now(tz=tzlocal.get_localzone()))
            if next:
                ## new_task is to be completed and we keep the original task open
                completed_task = task.copy()
                remaining_task = task

                ## the remaining task should have recurrence id set to next start time, and range THISANDFUTURE
                if hasattr(remaining_task.instance.vtodo, 'recurrence_id'):
                    del remaining_task.instance.vtodo.recurrence_id
                remaining_task.instance.vtodo.add('recurrence-id')
                remaining_task.instance.vtodo.recurrence_id.value = next  ## TODO: should be same type as dtstart (date or datetime)
                remaining_task.instance.vtodo.dtstart.value = next  ## TODO: should be same type as dtstart (date or datetime)
                remaining_task.instance.vtodo.recurrence_id.params['RANGE'] = [
                    'THISANDFUTURE'
                ]
                remaining_task.instance.vtodo.rrule
                remaining_task.save()

                ## the completed task should have recurrence id set to current time
                ## count in rrule should decrease
                if hasattr(completed_task.instance.vtodo, 'recurrence_id'):
                    del completed_task.instance.vtodo.recurrence_id
                completed_task.instance.vtodo.add('recurrence-id')
                completed_task.instance.vtodo.recurrence_id.value = datetime.now(
                )
                completed_task.instance.vtodo.dtstart.value = datetime.now()
                count_search = re.search(
                    'COUNT=(\d+)', completed_task.instance.vtodo.rrule.value)
                if count_search:
                    completed_task.instance.vtodo.rrule.value = re.sub(
                        'COUNT=(\d+)',
                        'COUNT=%d' % (int(count_search.group(1)) - 1),
                        completed_task.instance.vtodo.rrule.value)
                completed_task.complete()

                continue

        task.complete()
Example #38
def longest_word(string):
    """Finds the longest word in a given string"""
    if len(string) == 1:
        return string

    # split string and remove any non-alphanum chars
    words = [replace(r'[\W+]', '', word) for word in string.split(' ')]

    if len(words) == 1:
        return string

    longest = ''
    for word in words:
        if len(word) > len(longest):
            longest = word

    return longest
Example #39
    def removeCommonWords(self):
        if self.channel == "Instagram":
            self.keywordList = ["인스타그램", "인스타", "팔로우", "맞팔", "인친", "셀스타그램", "그램", "스타"]
        elif self.channel == "Naver Blog":
            self.keywordList = ["포스팅", "블로그", "댓글", "이웃추가"]
        elif self.channel == "Twitter":
            self.keywordList = ["트윗", "RT", "트위터"]
        elif self.channel == "Naver News":
            self.keywordList = ["없음", "헤럴드", "역필", "투데이", "머니", "코리아", "기자", "오마이", "구독", "연합", "채널", "네이버", "뉴시스",
                                "금지", "저작", "무단", "뉴스", "재배포"]
        else:
            self.keywordList = []

        # self.keywordList.append(self.keyword)

        for keyword in self.keywordList:
            self.text = re.sub(keyword, " ", self.text)
Example #40
def longest_word(string):
    """Finds the longest word in a given string"""
    if len(string) == 1:
        return string

    # split string and remove any non-alphanum chars
    words = [replace(r'[\W+]', '', word) for word in string.split(' ')]

    if len(words) == 1:
        return string

    longest = ''
    for word in words:
        if len(word) > len(longest):
            longest = word

    return longest
Example #41
def defragment(tokenized):
    """Converts token list to string
    
    Arguments:
        tokenized {list} -- Tokenized string
    
    Return:
        str -- Rebuilt string
    """

    title = ""
    for token in tokenized:
        if token in "([{":
            title += token
        elif token in "}])":
            title = title[:-1] + token + " "
        else:
            title += token + " "
    return replace(r"(?<=\d)( \/ )(?=\d)", "/", title[:-1])
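
A quick hypothetical call, assuming `replace` is aliased to `re.sub` as in the other examples here:

# defragment(["Size", "(", "1", "/", "2", ")"])  ->  "Size (1/2)"
# the ")" branch trims the space before it; the final sub collapses " / " between digits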
Example #42
def todo_complete(caldav_conn, args):
    if args.nocaldav:
        raise ValueError("No caldav connection, aborting")
    tasks = todo_select(caldav_conn, args)
    for task in tasks:
        if hasattr(task.instance.vtodo, 'rrule'):
            rrule = rrulestr(task.instance.vtodo.rrule.value)
            try:
                next = rrule.after(datetime.now())
            except TypeError: ## pesky problem with comparison of timestamps with and without tzinfo
                next = rrule.after(datetime.now(tz=tzlocal.get_localzone()))
            if next:
                ## new_task is to be completed and we keep the original task open
                completed_task = task.copy()
                remaining_task = task

                ## the remaining task should have recurrence id set to next start time, and range THISANDFUTURE
                if hasattr(remaining_task.instance.vtodo, 'recurrence_id'):
                    del remaining_task.instance.vtodo.recurrence_id
                remaining_task.instance.vtodo.add('recurrence-id')
                remaining_task.instance.vtodo.recurrence_id.value = next ## TODO: should be same type as dtstart (date or datetime)
                remaining_task.instance.vtodo.dtstart.value = next ## TODO: should be same type as dtstart (date or datetime)
                remaining_task.instance.vtodo.recurrence_id.params['RANGE'] = [ 'THISANDFUTURE' ]
                remaining_task.instance.vtodo.rrule
                remaining_task.save()

                ## the completed task should have recurrence id set to current time
                ## count in rrule should decrease
                if hasattr(completed_task.instance.vtodo, 'recurrence_id'):
                    del completed_task.instance.vtodo.recurrence_id
                completed_task.instance.vtodo.add('recurrence-id')
                completed_task.instance.vtodo.recurrence_id.value = datetime.now()
                completed_task.instance.vtodo.dtstart.value = datetime.now()
                count_search = re.search('COUNT=(\d+)', completed_task.instance.vtodo.rrule.value)
                if count_search:
                    completed_task.instance.vtodo.rrule.value = re.sub('COUNT=(\d+)', 'COUNT=%d' % (int(count_search.group(1)) - 1), completed_task.instance.vtodo.rrule.value)
                completed_task.complete()

                continue

        task.complete()
Example #43
    def getMain(self, sentence):
        re = ''
        words, rely_id, relation = self.getLTPAnalysis(sentence)
        #hed = self.getHED(array)
        if 0 in rely_id:
            hed = rely_id.index(0)
            sbv = self.getWord(words, rely_id, relation, hed, 'SBV')  # subject
            vob = self.getWord(words, rely_id, relation, hed, 'VOB')  # object
            fob = self.getWord(words, rely_id, relation, hed, 'FOB')  # postposed object

            adv = self.getWord(words, rely_id, relation, hed, 'ADV')  # adverbial modifier
            pob = self.getWord(words, rely_id, relation, hed,
                               'POB')  # prepositional object; may act as subject when none is found

            zhuWord = self.getFirstNotNone([sbv, pob])  # final subject
            weiWord = words[hed]  # final predicate
            binWord = self.getFirstNotNone([vob, fob, pob])  # final object

            re = '{} {} {}'.format(zhuWord, weiWord, binWord)

        return re.replace('None', ' ')
Example #44
def _insert_value(line, value, type):
    """.. Insert value into line.
    
    Parameters
    ----------
    line : str
        Line of document to insert value.
    value : str
        Value to insert.
    type : str
        Formatting for value.

    Returns
    -------
    line : str
        Line of document with inserted value.
    """
    
    if (type == 'no change'):
        line = re.sub('\\\\?#\\\\?#\\\\?#', value, line)
    elif (type == 'round'):
        try:
            value = float(value)
        except:
            raise_from(CritError(messages.crit_error_not_float % value), None)

        digits = re.findall('\\\\?#([0-9]+)\\\\?#', line)[0]
        rounded_value = format(value, '.%sf' % digits)
        line = re.sub('(.*?)\\\\?#[0-9]+\\\\?#', r'\g<1>' + rounded_value, line)
    elif (type == 'comma + round'):
        try:
            value = float(value)
        except:
            raise_from(CritError(messages.crit_error_not_float % value), None)

        digits = re.findall('\\\\?#([0-9]+),\\\\?#', line)[0]
        rounded_value = format(value, ',.%sf' % digits)
        line = re.sub('(.*?)\\\\?#[0-9]+,\\\\?#', r'\g<1>' + rounded_value, line)

    return(line)
Example #45
def query_naivebayes(model={}, text='', min_word_size=4, k=1):
    from decimal import Decimal, Context
    from math import log
    from re import sub as replace

    # create a context for the built-in Decimal
    context = Context()

    # process the text so it keeps only meaningful words
    if type(text).__name__ == 'str':
        print('Processando texto, stemming, removendo stopwords ....\n\n')
        words = text_processing(replace(r'([^a-zA-Z ]+)', r'', text),
                                min_word_size=min_word_size)
    else:
        words = text

    #return model

    # build the probability vector
    prob_topic = list()
    for topic in model.keys():

        #print("Calculando topico: %s" % topic)

        prob = context.ln(model[topic]['P_topic'])
        words_of_topic = model[topic]['P_wk'].keys()
        #print("ANTES %s" % words)
        for wk in words:
            #print("PROBLEMA %s %f %f" % (wk, model[topic]['P_wk'][wk], log(model[topic]['P_wk'][wk])))
            if not wk in words_of_topic:
                prob += context.ln(model[topic]['P_wk']['espuria'])
            else:
                prob += context.ln(model[topic]['P_wk'][wk])
        prob_topic.append({topic: prob})

    sorted_probs = sorted(prob_topic,
                          key=lambda k: tuple(k.values())[0],
                          reverse=True)

    return sorted_probs[0:k]
Example #46
def query_naivebayes(model={}, text='', min_word_size=4, k=1):
	from decimal import Decimal, Context
	from math import log
	from re import sub as replace

	# create a context for the built-in Decimal
	context = Context()

	# process the text so it keeps only meaningful words
	if type(text).__name__ == 'str':
		print('Processando texto, stemming, removendo stopwords ....\n\n')
		words = text_processing( replace(r'([^a-zA-Z ]+)', r'', text),
				min_word_size=min_word_size)
	else:
		words = text
	
	#return model

	# build the probability vector
	prob_topic = list()
	for topic in model.keys():
		
		#print("Calculando topico: %s" % topic)
		
		prob = context.ln(model[topic]['P_topic'])
		words_of_topic = model[topic]['P_wk'].keys()
		#print("ANTES %s" % words)
		for wk in words:
			#print("PROBLEMA %s %f %f" % (wk, model[topic]['P_wk'][wk], log(model[topic]['P_wk'][wk])))
			if not wk in words_of_topic:
				prob += context.ln( model[topic]['P_wk']['espuria'] )
			else:
				prob += context.ln( model[topic]['P_wk'][wk] )
		prob_topic.append( {topic: prob} )
	
	sorted_probs = sorted(prob_topic, key=lambda k: tuple(k.values())[0], 
							reverse=True)
	
	return sorted_probs[0:k]
Example #47
def validate_query(query):

    query = re.sub(r'" *~', '"~', query)
    query = re.sub(r'~ *', '~', query)

    parts = query.split('"')

    if parts[0] == '':
        parts = parts[1:]

    in_phrase = False

    for part in parts:

        in_phrase = not in_phrase

        if '*' in part and in_phrase:
            raise ValueError(
                '* is not allowed in a phrase ({})'.format(part))
        elif part[0] == '~' and re.sub(r'^~[0-9]+', '', part) == part:
            raise ValueError(
                '~ must be followed by a number after a phrase ({})'.format(part))

    return query
Example #48
    def _get_cleared_text(self):
        """ Perform some operations to obtain a cleared text
            - Replace tabs with spaces and remove extra spaces
            - Replace extra symbols after list elements
        """
        content = (self.content or '').strip()
        flags = MULTILINE|UNICODE

        # STEP 1: Remove tabs and extra spaces
        content = replace(r'[ \t]+', r' ', content, flags=flags)

        # STEP 2: Remove spaces from both line bounds
        content = replace(r'[ \t]+$', r'', content, flags=flags)
        content = replace(r'^[ \t]+', r'', content, flags=flags)

        # STEP 3: Replace CRLF by LF and remove duplicates
        content = replace(r'[\r\n]', r'\n', content, flags=flags)
        content = replace(r'[\n]{2,}', r'\n\n', content, flags=flags)

        # STEP 4: Update questions and answers numbering formats
        content = replace(r'^([0-9]{1,10})[.\-)]+[ \t]+', r'\1. ', content, flags=flags)
        content = replace(r'^([a-zñA-ZÑ])[.\-)]+[ \t]+', r'\1) ', content, flags=flags)

        return content
Example #49
	dest = os.getcwd()
	if len(parse.arguments) > 1:
		dest = os.path.join(dest, parse.arguments[2])
	
	if os.path.isdir(dest):
		dest = os.path.join(dest, parse.arguments[1].replace(tpl_suffix, '.php'))
	
	if dest[-4:] != '.php':
		dest += '.php'
	
	# check if destination doesn't exist or --force is on
	if os.path.isfile(dest) and not parse.options['force']:
		safe.quit('Destination file already exists. Use -f/--force to overwrite.', 1)

	print 'Copying {0} to {1}.'.format(src, dest)
	
	# copy file
	safe.catch(shutil.copy, (src, dest), 'Can\'t copy template ({0}) to {1}.')
	
	print 'Ok.'
	
	# replace title with that from command line
	if parse.options['title'] != default_title:
		from contextlib import nested
		import re
		with nested(safe.fopen(dest), safe.fopen(dest + '.tmp', 'w')) as (s, d):
			for i in s:
				d.write(re.sub(r'(\$APPLICATION->SetTitle\(\s*")[^"]*("\s*\))', r'\g<1>{0}\g<2>'.format(parse.options['title']), i))
	
	
Example #50
    def goto(self, axis_letter, offset=False):
        coordinate = float(getattr(
            self,
            'lcdWorkNumber' +
            axis_letter.upper()).value())
        format = "{0:." + str(conf.get('ui.lcd_precision')) + "f}"
        formatted_coord = format.format(coordinate)

        coordinate, ok = QtGui.QInputDialog.getText(
            self,
            'Goto',
            u'''
            Enter a new coordinate for the %s axis.
            \u2022 -20 or +20          absolute
            \u2022 --20 or ++20        relative
            \u2022 50%%                 percent of current position
            \u2022 ((here + 20) / 3.2) arithmetic
            ''' % axis_letter.upper(),
            QtGui.QLineEdit.Normal,
            formatted_coord)
        coordinate = str(coordinate).strip()

        if ok:
            # relative
            match = re.search(r'^(([\-\+])\2)[\d\.]+$', coordinate)
            if match is not None:
                self.logger.debug(coordinate)
                coordinate = re.sub(r'^([\-\+])\1', r'\1', coordinate)
                if coordinate == 0:
                    coordinate = abs(coordinate)
                self.logger.debug(coordinate)
                self.controller.move_axis(axis_letter, coordinate)
                return None

            # absolute
            match = re.search(r'^([\-\+])?[\d\.]+$', coordinate)
            if match is not None:
                self.controller.move_axis(axis_letter, coordinate, True)
                return None

            # percentage
            match = re.search(r'^([\d\.]+)\%$', coordinate)
            if match is not None:
                group = match.groups()
                if offset:
                    current_coordinate = getattr(
                        self, 'lcdWorkNumber' + axis_letter.upper()).value()
                else:
                    current_coordinate = getattr(
                        self, 'lcdMachNumber' + axis_letter.upper()).value()

                current_coordinate = (
                    current_coordinate * (float(group[0]) / 100))
                if current_coordinate == 0:
                    current_coordinate = abs(current_coordinate)

                self.controller.move_axis(
                    axis_letter, current_coordinate, True)
                return None

            # expression
            match = re.search(r'^\(.+\)$', coordinate)
            if match is not None:
                if offset:
                    current_coordinate = getattr(
                        self, 'lcdWorkNumber' + axis_letter.upper()).value()
                else:
                    current_coordinate = getattr(
                        self, 'lcdMachNumber' + axis_letter.upper()).value()

                coordinate = re.sub(r'\bhere\b', str(current_coordinate), coordinate)

                ns = {'__builtins__': None}
                new_coordinate = eval(coordinate, ns)

                if new_coordinate == 0:
                    new_coordinate = abs(new_coordinate)

                self.controller.move_axis(axis_letter, new_coordinate, True)
                return None
Example #51
	"wouldn't've": "would not have",
	"y'all": "you all",
	"y'all'd": "you all would",
	"y'all'd've": "you all would have",
	"y'all're": "you all are",
	"y'all've": "you all have",
	"you'd": "you had / you would",
	"you'd've": "you would have",
	"you'll": "you shall / you will",
	"you'll've": "you shall have / you will have",
	"you're": "you are",
	"you've": "you have"
}
# build a list of contractions with the apostrophes stripped
from re import sub as replace
contractions_without_punc = replace(r'([^a-z ]+)', r'', ' '.join(contractions.keys())).split(' ')


##
# Function that builds the naive Bayes "model", a structure
# holding the computed probabilities
##
# texts			=> List with the names of the files to be
#					learned, or the already processed text
# 					structure (the structure returned by the
#					prepare_reuters_data function)
#
##
# Returns a dictionary whose keys are the topics found;
# for each key there is another associated dictionary with
# two keys: the probability of the topic in question,
Example #52
for i in range(len(items)):
	list_item = driver.find_element_by_xpath("//*[@id='rso']/li[@class='psli']["+str(i+1)+"]")
	item = driver.find_element_by_xpath("//*[@id='rso']/li[@class='psli']["+str(i+1)+"]/div[1]/div[2]/h3/a")
	item.click()
	driver.implicitly_wait(10)
	driver.find_element_by_xpath("//*[@id='rso']/li[@class='psli']["+str(i+1)+"]/following-sibling::li/div/div/div/div[2]/div[1]/a").click()#get product reviews

	try: # try clicking "get all product reviews"
		driver.find_element_by_xpath("//*[@id='reviews-by-others']/div[6]/a").click()
	except:
		pass
	#//*[@id="reviews-by-others"]/div[3]/div[2]/div[2]/div[3]/span

	_product_name = driver.find_element_by_xpath("//*[@id='product-name']").text
	_product_name = _product_name.replace(" ", "_")
	print "_product_name " + _product_name
	product_data = {"product_name":_product_name,
					"reviews":[]
					}
	print "product_data " + json.dumps(product_data)
	review_cnt = 0
	while True:
		try:
			
			fn = product_data["product_name"] + ".txt"
			fdir = dataFolder + "/" + fn
			reviews_cnt = len(driver.find_elements_by_xpath("//*[@id='reviews-by-others']/div[3]/div")) #//*[@id="reviews-by-others"]/div[3]/div[1]/div[2]/div[3]/span
			print 'reviews_cnt is ' + str(reviews_cnt)
			for j in range(reviews_cnt):
					#//*[@id="reviews-by-others"]/div[3]/div[6]/div[2]/div[3]/span
Example #53
File: main.py Project: rahul1/fb1
def removeStopwords(text):
    return re.sub(stopwordPattern, "", text)
Example #54
def prepare_reuters_data(file_name_list=list(), 
			min_word_size=4, process_text=True,
			text_processing_function=nltk_tokenizer):
	from bs4 import BeautifulSoup
	from re import sub as replace
	from datetime import datetime as date
	
	# tags used in the structure of a news item
	record_tag = 'reuters'
	topics_tag = 'topics'
	topic_tag = 'd'
	text_tag = 'text'
	
	
	result = dict({'texts': list(), 'topics': dict(), 
					'text_to_topics':[]})
	index = 0
	
	for file_name in file_name_list:
		print(date.now().strftime('[%Y-%m-%d %H:%M:%S]') + 
				" - Processando arquivo: %s" % file_name)

		file = open(file_name, 'r', encoding='utf-8', 
					errors='ignore')
		xml = BeautifulSoup( file.read() ). \
				findAll(record_tag)
		
		print("\tNumero de registros a ser processado: %d" % 
				len(xml))

		for record in xml:
			topics = record.find(topics_tag). \
				findAll(topic_tag)

			# discard news items without a topic
			if len(topics) == 0:
				continue
			
			# add the current text's index to the list of
			# every topic present in this news item
			current_topics = []
			for topic in topics:
				if not topic.text in result['topics'].keys():
					result['topics'][topic.text] = list()
				result['topics'][topic.text].append(index)
				current_topics.append(topic.text)
			
			# for a given text, the i-th element of the
			# result['texts'] list, build a list with the
			# topics related to that text
			result['text_to_topics'].append(current_topics)

			text = record.find(text_tag).text
			if process_text:
				# remove characters other than letters and
				# spaces, lowercase the text, and collapse
				# any whitespace run into a single space
				text = replace(r'([\s]+)', r' ', text.lower())
				text = replace(r'([^a-zA-Z ]+)', r'', text)
				# apply the text_processing_function
				text = text_processing_function(text=text, 
							min_size=min_word_size)
			result['texts'].append(text)
			
			#print(index)
			index += 1
	
	return result
Example #55
import re
print "Hello, World!"

s = "s#$%^&plunk"
print re.sub(r'\W', '', s)
Example #56
    def format_image(self, image):

        name = image["name"]

        # If inlining is turned on then we need to embed the image
        # into the generated output HTML file.
        if(self.m_inline == True):
            handle = open(name, "rb")
            data = base64.encodestring(handle.read())
            data = data.replace("\n", "")
            name = "data:image/jpeg;base64," + data
            handle.close()

        style = ""
        caption = ""
        href_start = ""
        href_end   = ""

        if(image.has_key("width")):
            style += "width:%s;" % image["width"]
        if(image.has_key("height")):
            style += "height:%s;" % image["height"]
        if(image.has_key("caption")):
            caption = image["caption"]

        if(image.has_key("href")):
            href_start = "<a style='text-decoration:none;' href='%s'>" % image["href"]
            href_end = "</a>"


        if(image.has_key("align") and (image["align"] == "center" or image["align"] == "right")):
            if(image["align"] == "center"):
            
                return """
%s
<center>
<table style='text-align:center;'>
    <tr><td><img src='%s' style=\"%s\"/></td></tr>
    <tr><td><b>%s</b></td></tr>
</table>
</center>
%s
""" % (href_start, name, style, caption, href_end)
            elif(image["align"] == "right"):
                return """
%s
<table style='text-align:center;float:right;'>
    <tr><td><img src='%s' style=\"%s\"/></td></tr>
    <tr><td><b>%s</b></td></tr>
</table>
%s
""" % (href_start, name, style, caption, href_end)
                

        else:
            return """
%s
<span style='display:inline;'>
<table style='display:inline;text-align:center;'>
    <tr><td><img src='%s' style=\"%s\"/></td></tr>
    <tr><td><b>%s</b></td></tr>
</table>
</span>
%s
""" % (href_start, name, style, caption, href_end)
Example #57
def split_nasdaq(symbol):
    sym = re.sub(_symbol_delimiter_regex, '', symbol)
    return sym[:4], sym[4:]
Example #58
def no_groups(re):
    return re.replace('(', '(?:').replace('(?:?', '(?')
Example #59
def whitelist(value):
    return replace("[^\da-zA-Z\-_]", "", value)