Exemple #1
1
def getCategoryUrl(site="",url=""):
    """Scrape the three-level category listing at ``url`` into the category table.

    The page is fetched with the module-level ``session`` and walked as
    ``.classify_books`` > ``.classify_kind`` > ``ul li a``. For every link whose
    href matches ``/cp<id>.html`` (stored as type ``book``) or ``/cid<id>.html``
    (stored as type ``nonbook``) a document is inserted unless one with the
    same ``catId`` already exists.

    Returns ``False`` when the response body is empty, ``True`` otherwise.
    """
    catDb = openTable(tableName=global_setting['catTable'])
    response = session.get(url)
    if not response.text:
        return False

    # (href pattern, category type); the first pattern that matches wins.
    id_patterns = (
        (r'\/cp(.*)\.html', 'book'),
        (r'\/cid(.*)\.html', 'nonbook'),
    )

    soup = BeautifulSoup(response.text)
    for top in soup.select('.classify_books'):
        top_name = re.sub(r'\s', '', top.select('.classify_title')[0].text)
        for middle in top.select('.classify_kind'):
            middle_name = re.sub(r'\s', '', middle.select('.classify_kind_name')[0].text)
            for link in middle.select('ul li a'):
                leaf_name = link.text.strip()
                href = link['href']

                cat_id = cat_type = None
                for pattern, kind in id_patterns:
                    found = re.findall(pattern, href)
                    if found:
                        cat_id, cat_type = found[0], kind
                        break
                if cat_id is None:
                    # Neither pattern matched: not a category link.
                    continue

                if catDb.find({'catId': cat_id}).count() > 0:
                    logger.debug('catetogy %s exists,skip\n' % (cat_id))
                    continue
                catDb.insert({
                    'catId': cat_id,
                    'level1': top_name,
                    'level2': middle_name,
                    'level3': leaf_name,
                    'catUrl': href,
                    'catType': cat_type,
                    'site': site,
                })
    return True
Exemple #2
0
 def _sanitize(self, data):
     retv = ''
     if data.find('\x1b') != -1:
         tmp = filter(lambda x: x in string.printable, data)
         retv += re.sub('(\{|\}|\*|\%)', '', re.sub('\[[0-9\;]+m', '', tmp))
         return retv
     return data
Exemple #3
0
    def _clean_text(self, text):
        """ Cleans up text before we make it into an HTML tree:
            1. Nukes <![CDATA stuff.
            2. Nukes XML encoding declarations
            3. Replaces </br> with <br/>
            4. Nukes invalid bytes in input
            5. ?
        """
        # Remove <![CDATA because it causes breakage in lxml.
        text = re.sub(r"<!\[CDATA\[", u"", text)
        text = re.sub(r"\]\]>", u"", text)

        # Remove <?xml> declaration in Unicode objects, because it causes an error:
        # "ValueError: Unicode strings with encoding declaration are not supported."
        # Note that the error only occurs if the <?xml> tag has an "encoding"
        # attribute, but we remove it in all cases, as there's no downside to
        # removing it. This moves our encoding detection to chardet, rather than
        # lxml.
        if isinstance(text, unicode):
            text = re.sub(r"^\s*<\?xml\s+.*?\?>", "", text)

        # Fix </br>
        text = re.sub("</br>", "<br/>", text)

        # Fix invalid bytes (http://stackoverflow.com/questions/8733233/filtering-out-certain-bytes-in-python)
        text = re.sub(u"[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\u10000-\u10FFFF]+", "", text)

        return text
Exemple #4
0
    def _parse_productions(self):
        """
        Build the list of productions described by the text widget buffer.

        The buffer text is normalized (every ``self.ARROW`` match becomes
        ``->`` and tabs become spaces), split into lines, and each non-blank
        stripped line is handed to ``parse_cfg_production``; whatever that
        returns is added to the result list.
        """
        raw = self._textwidget.get('1.0', 'end')
        normalized = re.sub('\t', ' ', re.sub(self.ARROW, '->', raw))

        productions = []
        for raw_line in normalized.split('\n'):
            stripped = raw_line.strip()
            if stripped:
                productions += parse_cfg_production(stripped)

        return productions
def file_num_sort(a, b):
    """Three-way comparator that orders names by the digits they contain.

    All non-digit characters are removed from each name and the remaining
    digits are compared as one integer. When either name contains no digits
    the two names are compared as plain strings instead.

    Returns -1, 0 or 1, so it can be used as a ``cmp`` function on Python 2 or
    through ``functools.cmp_to_key`` on Python 3.
    """
    def _three_way(x, y):
        # Portable replacement for the cmp() builtin, which Python 3 removed.
        return (x > y) - (x < y)

    a_digits = re.sub(r'[^0-9]+', '', a)
    b_digits = re.sub(r'[^0-9]+', '', b)
    if a_digits == '' or b_digits == '':
        return _three_way(a, b)
    return _three_way(int(a_digits), int(b_digits))
Exemple #6
0
 def makeIdentifier(self, string):
     """Turn ``string`` into a lowercase, underscore-separated identifier.

     Whitespace runs are collapsed, the text is passed through ``safeEncode``
     and NFKD-normalized, the listed punctuation characters are removed, runs
     of spaces/underscores become a single ``_``, and leading/trailing
     underscores are trimmed.
     """
     collapsed = re.sub(r"\s+", " ", string.strip())
     normalized = unicodedata.normalize('NFKD', safeEncode(collapsed))
     without_punct = re.sub(r"['\"!?@#$&%^*\(\)_+\.,;:/]", "", normalized)
     joined = re.sub(r"[_ ]+", "_", without_punct)
     return joined.strip('_').strip().lower()
Exemple #7
0
def shortcodify(name):
    """Return a lowercase short code for ``name``.

    Characters that are neither word characters nor whitespace are removed,
    then every single whitespace character is replaced by a dash.
    """
    # Keep only word characters and whitespace.
    kept = re.sub(r'[^\w\s]', "", name)
    # One dash per whitespace character (runs are not collapsed).
    return re.sub(r'\s', '-', kept).lower()
Exemple #8
0
    def GetFootnotes(self, doc, plainPrefix, prefix):
        """Extract the footnotes of one chapter and append them to ``self.footnotes``.

        ``doc`` is serialized with ``html.tostring`` and split on ``<a name="P``.
        For each chunk the footnote number is the text before the first ``"``
        and the verse reference is what follows the first ``#``; chunks whose
        verse reference does not start with ``W`` are skipped. The footnote
        text is cleaned with the substitutions below, and one anchor line per
        footnote is built from ``plainPrefix`` (ids/hrefs) and ``prefix``
        (visible label). The chapter's lines are joined with newlines and
        appended to ``self.footnotes`` as a single string.
        """
        # (pattern, replacement) pairs, applied in this order with re.sub.
        subs = (
        # remove trailing whitespaces
            (r'\s+<br>', r'<br>'),
        # change class name
            (r'skrot', r'przypis'),
        # fix href
            ('%20', ''),
            ('%C5%821', 'l'),
            ('%C5%822', 'L'),
        # one newline is enough
            (r'<br><br>', r'<br>'),
        # <div> tags were not open
            (r'<br></div>', r'<br>')
        )

        chapterFootnotes = []
        for ppp in html.tostring(doc).split(r'<a name="P') :
            footnote = ppp.partition('"><b>')
            footnoteNo = footnote[0].partition('"')[0]
            verse = re.sub(r'^[^#]*#', '', footnote[0])
            # startswith() instead of verse[0]: a chunk with an empty verse
            # reference used to raise IndexError; it is simply not a footnote.
            if not verse.startswith('W'):
                continue
            footnoteText = re.sub(r'otworz\.php\?skrot=', r'#W', footnote[2].partition(' - ')[2]).strip()

            for fromPattern, toPattern in subs:
                footnoteText = re.sub(fromPattern, toPattern, footnoteText)

            verse = re.sub('W', ',', verse)
            chapterFootnotes.append('<a id="P' + plainPrefix + 'P' + footnoteNo + '" href="#W' + plainPrefix + verse + '" class="przypis"> [' + prefix + verse + ']</a> ' + footnoteText)

        self.footnotes.append("\n".join(chapterFootnotes))
Exemple #9
0
    def copy_template():
        """Copy the template tree to ``name`` and fill in its placeholders.

        ``template``, ``name``, ``options``, ``config_prompt`` and
        ``replace_variable`` come from the enclosing scope. Steps:

        1. Run ``config_prompt(template)`` and copy ``template`` to ``name``.
        2. Delete ``config.yaml`` from the copy if it exists.
        3. Rename every directory called ``options.template`` inside the copy
           to ``name`` (copy, then remove the original).
        4. Rewrite every file in the copy, replacing ``{{ var }}`` and
           ``__config_var__`` placeholders through ``replace_variable``.
        """
        config_prompt(template)
        shutil.copytree(template, name)

        config_path = '%s/%s' % (name, 'config.yaml')
        if os.path.exists(config_path):
            os.remove(config_path)

        # NOTE(review): directories are copied/removed while os.walk is still
        # iterating over the same tree - confirm this is the intended order.
        for dirname, dirnames, files in os.walk(name):
            for d in dirnames:
                if d == options.template:
                    shutil.copytree('%s/%s' % (dirname, d), '%s/%s' % (dirname, name))
                    shutil.rmtree('%s/%s' % (dirname, d))

        for dirname, dirnames, files in os.walk(name):
            for filename in files:
                path = '%s/%s' % (dirname, filename)
                # Context managers close the handle even if reading, the
                # substitution callback, or writing raises.
                with open(path, 'r') as f:
                    lines = f.readlines()

                first_pass = [re.sub(r'{{\s*(\w+)\s*}}', replace_variable, line) for line in lines]
                new_lines = [re.sub(r'__config_(\w+)__', replace_variable, line) for line in first_pass]

                with open(path, 'w') as f:
                    f.write(''.join(new_lines))
def clean_word(word):
    """Remove digit-runs that are followed by a space, then all whitespace.

    Every match of "optional digits followed by one space" is deleted first;
    any whitespace left afterwards (including newlines and carriage returns)
    is deleted as well.
    """
    without_numbers = re.sub(r"[0-9]* ", "", word)
    return re.sub(r"\s+", "", without_numbers)
def main():
    """Run pyflakes on the requested modules and print the warnings that are not excluded.

    Modules are taken from the command line (default: ``django_evolution``).
    The working directory is changed to the parent of this script's directory
    before pyflakes is started. Each output line is compared against the
    entries of ``pyflakes.exclude`` (next to this script; lines starting with
    ``#`` are ignored) after the first ``:<number>:`` has been replaced by
    ``:*:`` and every ``line <number>`` by ``line *``.
    """
    # Resolve to an absolute path first: with a relative __file__ the
    # exclusions path below would be resolved against the directory we
    # chdir() into rather than the script's own directory.
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(os.path.join(cur_dir, ".."))
    modules = sys.argv[1:] or ['django_evolution']

    p = subprocess.Popen(['pyflakes'] + modules,
                         stderr=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         close_fds=True,
                         universal_newlines=True)

    # communicate() drains stdout and stderr together; reading only stdout
    # can block forever once the child fills the stderr pipe.
    stdout, _stderr = p.communicate()
    contents = stdout.splitlines()

    # Read in the exclusions file
    with open(os.path.join(cur_dir, "pyflakes.exclude"), "r") as fp:
        exclusions = set(line.rstrip() for line in fp
                         if not line.startswith("#"))

    # Now filter things
    for line in contents:
        line = line.rstrip()
        test_line = re.sub(r':[0-9]+:', r':*:', line, count=1)
        test_line = re.sub(r'line [0-9]+', r'line *', test_line)

        if test_line not in exclusions:
            print(line)
Exemple #12
0
def convert_corpus(filepath, mapping, alignment, begin="xxBeGiN142xx", end="xxEnD142xx"):
    """Rebuild a text from the tokens in ``filepath`` through ``alignment`` and ``mapping``.

    Runs of ``begin`` markers (each followed by non-word characters) become a
    punctuation slot and runs of newlines become a paragraph break. Every
    other token ``t`` is emitted as ``mapping[alignment[t]]`` when that key is
    present in ``mapping``; a token missing from ``alignment`` raises
    ``KeyError``. A punctuation slot appends a randomly chosen mark
    (``.`` and ``,`` three times as likely as ``!`` and ``?``) to the previous
    word; a paragraph break appends ``".\\n\\n"``. Words following sentence
    punctuation or a newline are capitalized, as is the first word.

    ``end`` is accepted for interface compatibility and is not used.

    Returns the assembled text, or ``''`` when no word was produced.
    """
    with open(filepath, 'rb') as f:
        raw = f.read()
    if not isinstance(raw, str):
        # Python 3: a binary read yields bytes, which the str patterns below reject.
        raw = raw.decode('utf-8')

    general_corpus = re.sub('(' + begin + r'\W+)+', ' . ', raw)
    general_corpus = re.sub(r'\n+', ' this_is_n3wline ', general_corpus)

    # The original list read `',' ','`, which Python concatenates into the
    # single element ',,'; the missing comma is restored here.
    punctuation = ['.', '.', '.', ',', ',', ',', '!', '?']

    corpus = []
    for token in general_corpus.split():
        if token == '.':
            if corpus and '\n' not in corpus[-1]:
                corpus[-1] = corpus[-1] + random.choice(punctuation)
        elif token == 'this_is_n3wline':
            # Nothing to terminate when the text starts with a newline.
            if corpus:
                corpus[-1] = corpus[-1] + '.\n\n'
        elif alignment[token] in mapping:
            if corpus and re.search(r'[\n\.!?]', corpus[-1]):
                corpus.append(mapping[alignment[token]].capitalize().strip())
            else:
                corpus.append(mapping[alignment[token]].strip())

    if corpus:
        corpus[0] = corpus[0].capitalize()
    output = ' '.join(corpus)
    output = re.sub(r' +', ' ', output)
    output = re.sub(r'\n+ ', '\n\n', output)
    return output
Exemple #13
0
def html_remove_image_history(doc):
    """
    Strip the "Image history" and "Image links" sections from ``doc``.

    Each section runs from its ``<h2>`` heading up to and including the first
    following ``</ul>``.
    """
    for heading in ("Image history", "Image links"):
        doc = re.sub(r"<h2>" + heading + r"</h2>[\s\S]+?</ul>", r"", doc)
    return doc
def chunkifier(conc_text, bytes, kwic=False, highlight=False):
    """Split a passage into the text before, across, and after its hits.

    ``bytes`` holds the start offset of each hit inside ``conc_text``.
    * the first part runs from the start of the passage up to the first hit
    * the middle part runs from the first hit to the end of the last hit
    * the last part runs from the end of the last hit to the end of the passage

    With ``highlight`` set, each hit is produced by ``highlighter``; otherwise
    a hit is the first run of non-delimiter characters at its offset. The
    first and last parts are trimmed so they do not begin or end with a
    partial word. ``kwic`` is accepted but not used.

    Returns the tuple ``(start, middle, end)``.
    """
    leading = conc_text[:bytes[0]]
    hit_parts = []
    stop = 0
    last_index = len(bytes) - 1
    for index, offset in enumerate(bytes):
        remainder = conc_text[offset:]
        if highlight:
            piece, length = highlighter(remainder)
        else:
            piece = re.split("([^ \.,;:?!\'\-\"\n\r\t\(\)]+)", remainder)[1]
            length = len(piece)
        stop = offset + length
        hit_parts.append(piece)
        if index < last_index:
            # Text lying between this hit and the next one.
            hit_parts.append(conc_text[stop:bytes[index + 1]])
    trailing = conc_text[stop:]

    ## Make sure we have no words cut out
    leading = re.sub("^[^ ]+ ", "", leading)
    trailing = re.sub(" [^ ]+$", "", trailing)

    return leading, ''.join(hit_parts), trailing
Exemple #15
0
    def sendGPS(self, head, cmd):
        """Send ``head + cmd`` to the device and wait for a reply containing ``head``.

        The input buffer is flushed, the command is written with a CR/LF
        terminator, and up to 100 lines are read. The first line containing
        ``head`` is returned, starting at ``head`` and cut before the first
        ``*``. Reading an empty line aborts immediately with ``None``; running
        out of attempts prints a failure message and also yields ``None``.
        """
        attempts = 100
        request = head + cmd
        reply = ""
        print("GPS SEND: '%s'" % request)

        self.dev.flushInput()
        self.dev.write(request + "\r\n")

        for _ in range(attempts):
            reply = self.dev.readline()
            if len(reply) == 0:
                print("ZERO REPLY")
                return None

            reply = re.sub(r"\s+$", "", re.sub(r"^\s+", "", reply))
            print("RAW GPS REPLY: '%s'" % reply)

            start = reply.find(head)
            if start != -1:
                reply = reply[start:].split("*")[0]
                print("GPS REPLY: '%s'" % reply)
                return reply

        print("sendGPS: FAILED: '%s'" % reply)
Exemple #16
0
def ToC(testament, books):
    """Print a table of contents for one testament as HTML links.

    The index page is downloaded and the links found in the
    ``testament``-th ``<tr valign="top">`` are paired with ``books``. For each
    pair the link's ``class="ks" href="..."`` attributes are replaced by an
    in-document anchor ``#K<plain book name>`` and the result is printed
    followed by ``<br>``. A closing ``<br><br>`` is printed at the end.
    """
    url='http://biblia.deon.pl/index.php'
    page = html.fromstring(urllib2.urlopen(url).read())
    links = page.xpath('.//tr[@valign="top"][' + testament + ']/td/a')
    for entry, href in zip(links, books):
        anchor = r'href="#K' + unicodeToPlain(href) + r'"'
        print(re.sub(r'class=\"ks\" href=\".*?\"', anchor, html.tostring(entry)) + '<br>')
    print('<br><br>')
 def getPanelInfo(self, doc, strXPath):
     """Extract the embedded ``"html":"..."`` payload that follows a marker in ``doc``.

     Despite its name, ``strXPath`` is used as a plain substring marker
     (``str.find``), not evaluated as an XPath expression. The text from the
     marker up to the first ``})`` is taken, the part after ``"html":"`` is
     kept, escape sequences are cleaned up and a few HTML entities are
     restored.

     Returns the cleaned string, or ``""`` when the marker or the closing
     ``})`` is missing, when the result is empty, or when any exception occurs
     (the exception is logged through ``logger.error``).
     """
     try:
         npos = doc.text_content().find(strXPath)
         if npos == -1:
             return ""
         # From the marker to the end of the text, minus the last character.
         strContent = doc.text_content()[npos:-1]
         npos = strContent.find("})")
         if npos == -1:
             return ""
         # Keep everything up to and including the "}" of the first "})".
         strContent = strContent[0:npos+1]
         # Skip past the 8-character '"html":"' key and drop the last 4 characters.
         # NOTE(review): when the key is absent, find() returns -1 and the slice
         # starts at index 7 instead - confirm the key is always present.
         strContent = (strContent[strContent.find("\"html\":\"")+8:-4])
         if "v2" in self.xpathType:
             # NOTE(review): str.decode only exists on Python 2 byte strings.
             strContent = strContent.decode('unicode-escape')
         # Remove literal backslash-n / backslash-t sequences and stray backslashes.
         # NOTE(review): a "\n" directly after a "\t" appears to lose only its
         # backslash, leaving a stray "n" - verify against real input.
         strContent = re.sub(r"(\\n)*(\\t)*(\\ /)*(\\)*", "", strContent)
         # Unescape "\/" to "/".
         strContent = re.sub(r"\\/", "/", strContent)
         if strContent:
             # Restore angle brackets; note that "nbsp;" is removed without its
             # leading "&", which therefore stays in the text.
             strContent = strContent.replace("&lt;", "<").replace("&gt;", ">").replace("nbsp;", "")
         else:
             return ""
     except Exception:
         # Best effort: log the error with the line it happened on and return "".
         s=sys.exc_info()
         msg = (u"getPanelInfo Error %s happened on line %d" % (s[1],s[2].tb_lineno))
         logger.error(msg)
         return ""
     return strContent
 def start(self):
     """Load the Ushahidi settings and prepare the keyword pattern and trigger URL.

     ``USHAHIDI_KEYWORD``, ``USHAHIDI_RESPONSE`` and ``USHAHIDI_ERROR`` fall
     back to defaults; ``USHAHIDI_TRIGGER_URL`` is required. In the trigger
     URL, ``${sender_number}`` and ``${message_content}`` are rewritten to the
     ``%(sender)s`` and ``%(message)s`` format placeholders.
     """
     keyword = getattr(settings, 'USHAHIDI_KEYWORD', '')
     self.default_response = getattr(settings, 'USHAHIDI_RESPONSE', 'Thank you for your report.')
     self.error_response = getattr(settings, 'USHAHIDI_ERROR', "Due to some error, we're unable to process your message. Please resend.")
     # The keyword is inserted as-is, so it is treated as a regular expression.
     self.pattern = re.compile(r"^\s*(?:%s)(?:[\s,;:]+(.+))?$" % (keyword))

     placeholders = (
         (r'\$\{sender_number\}', '%(sender)s'),
         (r'\$\{message_content\}', '%(message)s'),
     )
     self.trigger_url = getattr(settings, 'USHAHIDI_TRIGGER_URL')
     for pattern, replacement in placeholders:
         self.trigger_url = re.sub(pattern, replacement, self.trigger_url)
Exemple #19
0
def gen_xkcd_sub(msg, hook=False):
    """Apply the xkcd word substitutions to ``msg``.

    Each phrase in the table is replaced (on word boundaries) by its
    counterpart, then "X-ass Y" is rewritten to "X ass-Y".

    Returns the rewritten message when anything changed. When nothing
    changed, returns ``msg`` itself, or ``None`` if ``hook`` is true.
    """
    substitutions = {
        # http://xkcd.com/1288/
        'witnesses': 'these dudes I know',
        'allegedly': 'kinda probably',
        'new study': 'tumblr post',
        'rebuild': 'avenge',
        'space': 'SPAAAAAACCCEEEEE',
        'google glass': 'virtual boy',
        'smartphone': 'pokedex',
        'electric': 'atomic',
        'senator': 'elf-lord',
        'car': 'cat',
        'election': 'eating contest',
        'congressional leaders': 'river spirits',
        'homeland security': 'homestar runner',
        'could not be reached for comment': 'is guilty and everyone knows it',
        # http://xkcd.com/1031/
        'keyboard': 'leopard',
        # http://xkcd.com/1418/
        'force': 'horse',
    }

    output = msg
    # The trailing "or True" makes this condition always hold; it is kept
    # exactly as written (random() is still consulted when hook is set).
    if not hook or random() < 0.001 or True:
        for phrase, replacement in substitutions.items():
            if phrase in output:
                output = re.sub(r"\b%s\b" % phrase, replacement, output)

    output = re.sub(r'(.*)(?:-ass )(.*)', r'\1 ass-\2', output)
    if msg != output:
        return output
    return None if hook else msg
Exemple #20
0
def injection_test(payload, http_request_method, url):
    """Send one request with ``payload`` substituted for the injection tag.

    For ``GET`` the tag (``settings.INJECT_TAG``) is replaced inside ``url``
    after spaces in the payload are encoded as ``%20``. For any other method
    the tag is replaced inside the POST data taken from ``menu.options.data``;
    when ``settings.IS_JSON`` is set the payload's double quotes are escaped
    and the data is re-serialized as JSON when it parses.

    Returns ``(response, vuln_parameter)``; ``response`` is ``None`` when the
    request was interrupted with Ctrl-C.
    """
    def _inject(text, template):
        # A function replacement inserts `text` literally. Passing the payload
        # as a replacement *string* made re.sub interpret its backslashes
        # (\n, \\, \1, \g<...>), corrupting or rejecting such payloads.
        return re.sub(settings.INJECT_TAG, lambda _match: text, template)

    # Check if defined method is GET (Default).
    if http_request_method == "GET":
        # Encoding spaces.
        payload = payload.replace(" ", "%20")

        # Define the vulnerable parameter
        vuln_parameter = parameters.vuln_GET_param(url)

        target = _inject(payload, url)
        request = urllib2.Request(target)

        # Check if defined extra headers.
        headers.do_check(request)

    # Check if defined method is POST.
    else:
        parameter = menu.options.data
        parameter = urllib2.unquote(parameter)

        # Check if its not specified the 'INJECT_HERE' tag
        parameter = parameters.do_POST_check(parameter)

        # Define the POST data
        if settings.IS_JSON == False:
            data = _inject(payload, parameter)
            request = urllib2.Request(url, data)
        else:
            payload = payload.replace("\"", "\\\"")
            data = _inject(urllib.unquote(payload), parameter)
            try:
                data = json.loads(data, strict = False)
            except (ValueError, TypeError):
                # Not valid JSON after injection: send the raw string instead.
                pass
            request = urllib2.Request(url, json.dumps(data))

        # Check if defined extra headers.
        headers.do_check(request)

        # Define the vulnerable parameter
        vuln_parameter = parameters.vuln_POST_param(parameter, url)

    try:
        # Get the response of the request
        response = get_request_response(request)
    except KeyboardInterrupt:
        response = None

    return response, vuln_parameter
Exemple #21
0
def extract_bow_v2_features(train, test, test_contains_labels = False):
    '''
    Build the text features for a second, simple tf-idf model used for
    ensembling.

    For every row the search term words are prefixed with "q", the product
    title words with "z", and the product description is appended; HTML is
    stripped with BeautifulSoup, non-alphanumeric characters become spaces and
    every word is stemmed.

    Returns (train texts, train labels, test texts, test labels); the test
    labels stay empty unless test_contains_labels is set.
    '''
    stemmer = PorterStemmer()

    def plain(markup, separator):
        # Text content of an HTML fragment, pieces joined with `separator`.
        return BeautifulSoup(markup, "lxml").get_text(separator)

    def featurize(frame, i, separator):
        query = (" ").join(["q" + z for z in plain(frame["search_term"][i], separator).split(" ")])
        title = (" ").join(["z" + z for z in plain(frame.product_title[i], separator).split(" ")])
        description = plain(frame.product_description[i], separator)
        s = re.sub("[^a-zA-Z0-9]", " ", query + " " + title + " " + description)
        return (" ").join([stemmer.stem(z) for z in s.split(" ")])

    # NOTE(review): train joins text pieces with " " while test joins them
    # with "" (the get_text() default). Preserved as-is - confirm whether this
    # train/test difference is intended.
    s_data, s_labels = [], []
    for i, _row in train.iterrows():
        s_data.append(featurize(train, i, " "))
        s_labels.append(str(train["relevance"][i]))

    t_data, t_labels = [], []
    for i, _row in test.iterrows():
        t_data.append(featurize(test, i, ""))
        if test_contains_labels:
            t_labels.append(str(test["relevance"][i]))

    return (s_data, s_labels, t_data, t_labels)
Exemple #22
0
def parse_profile(file_name):
    """Parse ``KEY=value`` assignments from a shell profile into a dict.

    A leading ``export`` is ignored and lines without ``=`` are skipped.
    Values are handled by how they start:

    * wrapped in tics (``TIC``): the first tic and everything from the last
      tic on are dropped; the value is otherwise stored untouched.
    * wrapped in double quotes (``QUOTE``): the quotes and anything after the
      closing quote are removed, then variables are expanded.
    * anything else: cut at the first ``#`` or whitespace, then variables are
      expanded.

    ``$NAME`` / ``${NAME}`` references are replaced by values parsed earlier
    in the file (empty string when unknown).

    Returns the dict of parsed keys and values.
    """
    return_dict = dict()
    with open(file_name) as reader:
        for line in reader:
            # Anchored so only a leading "export" is removed, not one that
            # happens to appear inside a value.
            line = re.sub(r"^export\s+", "", line.strip())
            if "=" not in line:
                continue
            key, value = line.split("=", 1)
            # Values that are wrapped in tics:  remove the tics but otherwise leave as is
            if value.startswith(TIC):
                # Remove first tic and everything after the last tic
                last_tic_position = value.rindex(TIC)
                return_dict[key] = value[1:last_tic_position]
                continue
            if value.startswith(QUOTE):
                # Remove the quotes and an optional trailing comment. The
                # previous pattern ('"(.+?)".+') demanded at least one
                # character after the closing quote, so a plain KEY="value"
                # line kept its quotes.
                value = re.sub(r'^"(.*?)".*', r'\g<1>', value)
            else:
                # Remove trailing whitespace and/or comments
                value = re.sub(r'(#|\s+).*', '', value)
            for variable in re.findall(r"\$\{?\w+\}?", value):
                # Replace embedded shell variables with their values
                dict_key = variable.strip("${}")
                value = value.replace(variable, return_dict.get(dict_key, ""))
            # Add this key to the dictionary
            return_dict[key] = value
    return return_dict
 def htmlify (self, text):
     """Wrap ``text`` in ``<p>`` tags, turning blank lines into paragraph
     breaks and single newlines into ``<br>``.

     The text is stripped first; it is not HTML-escaped.
     """
     body = "<p>%s</p>" % text.strip()
     body = re.sub('\n\n+', '</p><p>', body)
     return re.sub('\n', '<br>', body)
Exemple #24
0
def main(argv):
    """Set, get or delete a key in an etcd or consul key/value store.

    The store type, address, action, key and value come from ``parse_cli()``.
    The base URL is ``http://<address>`` plus ``/v2/keys`` for etcd or
    ``/v1/kv`` for consul, with duplicate slashes collapsed and trailing
    slashes removed. For ``get`` the parsed value is printed unless it is
    ``None``.
    """
    (backend, address, action, key, value) = parse_cli()

    api_paths = {'etcd': '/v2/keys/', 'consul': '/v1/kv/'}
    host = re.sub('http://', '', address)
    if backend in api_paths:
        base_url = host + api_paths[backend]
    else:
        base_url = ''

    # Collapse duplicate slashes before the scheme is put back in front.
    base_url = 'http://' + re.sub(r'\/+', '/', base_url)
    base_url = re.sub(r'\/+$', '', base_url)

    verb = action.lower()
    if verb == 'set':
        set_key_value(base_url, key, value)
    elif verb == 'get':
        value = parse_value(get_key_value(base_url, key), backend)
        if value is not None:
            print(value)
    elif verb == 'delete':
        delete_key_value(base_url, key)
def classifyText( text, params ):
    """Score ``text`` with the linear classifier stored on ``params``.

    The text is cleaned (``params.cleaner.clean_html`` when it succeeds, then
    tag removal, whitespace collapsing and lowercasing), tokenized into runs
    of ``a-z``, filtered against ``params.stopword_list`` and stemmed with
    ``params.porterStemmer``. The score ``z`` is the sum of
    ``params.linear_classifier`` weights of the stems found there.

    Returns ``(z < 0, [start_time, end_time, stem_count, z, sigmoid(z),
    int(z > 0)])`` with both times taken from ``params.my_time()``.
    """
    start_time = params.my_time()

    # Clean. Best effort: fall back to the raw text when the cleaner fails.
    # Only Exception is caught so Ctrl-C / SystemExit still propagate.
    try:
        text = params.cleaner.clean_html( text )
    except Exception:
        pass

    text = re.sub('<.*?>', ' ', text )
    text = re.sub(r'\s+', ' ', text )
    text = text.lower()

    # Tokenize
    tokens = re.findall('[a-z]+', text )

    # Remove stop words
    kept = [t for t in tokens if t not in params.stopword_list]

    # Stem
    stems = [params.porterStemmer.stem( t, 0, len(t)-1 ) for t in kept]

    z = 0
    for s in stems:
        if s in params.linear_classifier:
            z += params.linear_classifier[s]

    end_time = params.my_time()
    return ( z<0, [start_time, end_time, len(stems), z, 1/(1+math.exp(-z)), int(z>0)] )
Exemple #26
0
 def parse_list(self, page):
     """Yield one record per inspection from the CSV text in ``page``.

     Null bytes are replaced by spaces and runs of ``'`` are collapsed before
     parsing. Only rows whose ``CITY`` is ``CHARLOTTE`` are used. The data has
     one row per violation; consecutive rows sharing ``FAC_NAME`` and ``DATE``
     are rolled up into a single record (the first row of the group) whose
     ``violation`` key lists ``{'id', 'value', 'comment'}`` dicts. Nothing is
     yielded when no row matches.
     """
     # Remove null bytes
     page = re.sub(r'\0', r' ', page)
     # Remove sequences of '''''''
     page = re.sub(r"'+", "'", page)
     reader = csv.DictReader(StringIO(page), quoting=csv.QUOTE_ALL, escapechar='\\')
     # There is one row in the data for each violation, not just each
     # inspection. Violations from the same inspection will be contiguous,
     # so roll up the violations until we see a different inspection.
     current_record = None
     for row in reader:
         if row['CITY'] != 'CHARLOTTE':
             continue
         row['comments'] = []
         # Strip any leading zeros. Both 01 and 1 appear sometimes, but
         # they mean the same thing.
         item_id = row['ITEM_NUM'].lstrip('0')
         violation = {'id': item_id, 'value': row['ITEM_VALUE'], 'comment': row['COMMENT']}
         same_inspection = (
             current_record is not None
             and current_record['FAC_NAME'] == row['FAC_NAME']
             and current_record['DATE'] == row['DATE']
         )
         if same_inspection:
             current_record['violation'].append(violation)
             continue
         if current_record is not None:
             yield current_record
         current_record = row
         current_record['violation'] = [violation]
     # The final record won't be yielded from the loop above because it has
     # no following record to trigger it, so yield it here - but only if
     # there is one; yielding None for an empty result made callers receive
     # a bogus record.
     if current_record is not None:
         yield current_record
Exemple #27
0
def obfuscate_codeblocks(source):
  """Replace every codeblock in the source with a numbered placeholder.

  Hiding codeblock contents makes it safe to run other transformations on the
  source before the blocks are put back. Fenced codeblocks are handled first,
  then HTML codeblocks; placeholders are numbered in that order, starting
  from 1.

  Parameters
  ----------
  source : str
    string (as single stream) containing the source

  Returns
  -------
  protected_contents : list
    one ``[start, end, text]`` entry per codeblock found
  str
    source with each codeblock replaced by ``$PROTECTED-<n>``

  >>> source = '``` my code block ``` other contents'
  >>> prot, ob_source = obfuscate_codeblocks(source)
  >>> prot[0][2]
  '``` my code block ```'
  >>> ob_source
  '$PROTECTED-1 other contents'
  """
  obfuscate_source = source
  protected_contents = []
  for regex in (__regex_codeblock__, __regex_codeblock_html__):
    # finditer keeps scanning the string it was given, while each re.sub
    # below swaps the first remaining match in the updated source.
    for match in re.finditer(regex, obfuscate_source):
      protected_contents.append([match.start(), match.end(), match.group()])
      placeholder = '$PROTECTED-' + str(len(protected_contents))
      obfuscate_source = re.sub(regex, placeholder, obfuscate_source, 1)
  return protected_contents,obfuscate_source
Exemple #28
0
    def GetBook(self, book):
        """Download every chapter of ``book`` into ``self.content``.

        Chapters are requested one by one, starting at 1. The first response
        also supplies the book title (emitted as a ``tytul`` div behind a
        ``K<plain book name>`` anchor) and the chapter count, read from the
        ``rozdzial`` select box. Each chapter contributes its number, its
        content (``Book.GetContent``) and its footnotes
        (``Book.GetFootnotes``); after the last chapter the collected
        footnotes are appended to the content.
        """
        self.footnotes=[]
        self.content=[]
        plainBook = unicodeToPlain(book)
        url='http://www.biblia.deon.pl/otworz.php'

        counter = 1
        while True:
            values={'ksiega': book.encode('iso8859_2'),
              'rozdzial': str(counter)}
            request = urllib2.Request(url, urllib.urlencode(values))
            doc = html.fromstring(urllib2.urlopen(request).read())

            if counter == 1:
                BookTitle = (doc.findall('.//span[@style="font-size:22px;"]')[0])
                # Turn the title <span> into an anchored <div class="tytul">.
                title_html = re.sub(r'<span style=\"font-size:22px;\"',
                                    r'<br><br><a name="K' + plainBook + r'"></a><div class="tytul"',
                                    html.tostring(BookTitle))
                self.content.append(re.sub(r'</span>', r'</div>', title_html))
                ChaptersInBook = len(doc.findall('.//select[@name="rozdzial"]/option'))
            else:
                self.content.append('<br><br>')

            chapter = str(counter)
            plainPrefix = plainBook + chapter
            self.content.append('<div class="numer">' + chapter + '</div>')
            Book.GetContent(self, doc.xpath('//div[@class="tresc"]')[0], plainPrefix)
            Book.GetFootnotes(self, doc.xpath('//td[@width="150"]/table/tr[5]/td/div[1]')[0], plainPrefix, unicodeToReference(book) + ' ' + chapter)

            if counter == ChaptersInBook:
                self.content.append('<br><br>' + "".join(self.footnotes))
                break
            counter += 1
Exemple #29
0
    def sendCmd(self, cmd, trg=None, det=None, val=None, shouldBeAnswer=True):
        """Send a comma-separated command to the device and return its reply.

        The command is ``cmd`` followed by whichever of ``trg``, ``det`` and
        ``val`` are given. The reply line is returned with surrounding
        whitespace removed. An empty reply returns ``0`` when
        ``shouldBeAnswer`` is false. A ``SYNTAX ERROR`` reply is retried after
        0.1 s while ``self.retries <= self.max_retries``; once retries are
        used up the error reply itself is returned. Any other reply resets
        ``self.retries``.
        """
        parts = [str(cmd)]
        for extra in (trg, det, val):
            if extra != None:
                parts.append(str(extra))
        string = ",".join(parts)

        self.dev.write(string + "\r\n")
        res = self.dev.readline()
        res = re.sub(r"\s+$", "", re.sub(r"^\s+", "", res))
        if (not shouldBeAnswer) and len(res) == 0:
            print("CMD='%s' NORET" % (string))
            return 0

        print("CMD='%s' RET='%s'" % (string, res))
        if res != 'SYNTAX ERROR':
            self.retries = 0
            return res

        # Sometimes the device answers 'SYNTAX ERROR' although the same
        # command succeeds when it is sent again.
        if self.retries <= self.max_retries:
            self.retries = self.retries + 1
            time.sleep(0.1)
            return self.sendCmd(cmd, trg, det, val, shouldBeAnswer)

        print("%s %s" % ("IO ERROR: max_retries reached for command ", cmd))
        return res
 def __init__(self, responseObj=None, arguments=None):
     """Parse an option quote page into an ``MXOption``.

     Every table row under ``div#quotes`` is scanned: each ``<th>`` whose
     lowercased text is a key of ``self._MAPPING`` is paired with the ``<td>``
     at the same index, and the leading number of that cell is stored as a
     float under the mapped name (``--`` is read as ``-1``).

     The option type, strike price and expiration are taken from
     ``arguments['instrument']``, which is matched as
     ``<symbol> <yymmdd><C|P><strike>``.

     Raises ``Exception`` when a mapped header has no matching value cell.
     """
     super(MXOptionResponse, self).__init__(responseObj=responseObj, arguments=arguments)
     htmlTree = html.fromstring(self.getContentAsText())
     kwargs = { }
     for tr in htmlTree.xpath('//div[@id="quotes"]/section/section/table/tbody/tr'):
         td = tr.xpath('./td')
         #For each header in the table row
         for i, th in enumerate(tr.xpath('./th')):
             th_text = th.text.strip().lower() if th.text is not None else ''
             #If it is a mapped name
             if th_text in self._MAPPING:
                 #Throw exception if no value is present
                 if len(td) <= i:
                     raise Exception('Failed to match value for "%s" using index %d"' % (th_text, i))
                 #Assume all values are float, trim any spacing or symbols
                 #TODO fix me
                 if td[i].text == '--':
                     td[i].text = '-1'
                 kwargs[self._MAPPING[th_text]] = float(re.sub(r'\s*([-+]?(?:\d*[.])?\d+).*', r'\g<1>', td[i].text))
     self._instrument = arguments['instrument']
     #TODO Retrieve this from the HTML? We can probably stick with hardcoding from the name
     # Read the C/P flag from its position after the expiration digits.
     # Testing for a 'C' anywhere in the name classified every put on a
     # symbol containing the letter C as a CALL.
     type_match = re.search(r'\w+\s+[0-9]+(C|P)', self._instrument)
     if type_match:
         optionType = 'CALL' if type_match.group(1) == 'C' else 'PUT'
     else:
         # Name not in the expected layout: keep the previous heuristic.
         optionType = 'CALL' if 'C' in self._instrument.upper() else 'PUT'
     kwargs['_strikePrice'] = float(re.sub(r'\w+\s+[0-9]+(?:C|P)([-+]?(?:\d*[.])?\d+)', r'\g<1>', self._instrument))
     #TODO Retrieve this from the HTML?
     expirationStr = re.sub(r'\w+\s+(\d+)(?:C|P).*', r'\g<1>', self._instrument)
     kwargs['_expirationSec'] = time.mktime(time.strptime(expirationStr, '%y%m%d'))
     self._option = MXOption(optionType=optionType, **kwargs)
def clean_item_data(obj):
    """Return ``obj.text`` without its "label:" prefixes, stripped.

    Everything up to and including each colon is removed (per line), so only
    the text after the last colon remains. A falsy ``obj`` gives ``""``.
    """
    # NOTE(review): the truthiness test also rejects objects that evaluate
    # as false for other reasons - confirm what type callers pass in.
    if obj:
        return re.sub(r".*?:", "", obj.text).strip()
    return ""
Exemple #32
0
def handle_str(string):
    """Format ``string`` for HTML display and prefix it with the content label.

    Every space becomes ``&nbsp;`` and every CR/LF pair becomes ``<br/>``.
    """
    string = string.replace(' ', '&nbsp;')
    # The previous re.sub call passed re.S as its fourth positional argument,
    # which is `count`, not `flags` - so only the first 16 line breaks were
    # converted. A plain replace handles all of them.
    string = string.replace('\r\n', '<br/>')
    return '内容:%s' % string
#!C:\Users\aats\PycharmProjects\SU_python_fundamentals\phone_book_console_app\venv\Scripts\python.exe
# EASY-INSTALL-ENTRY-SCRIPT: 'pip==19.0.3','console_scripts','pip3.7'
__requires__ = 'pip==19.0.3'
import re
import sys
from pkg_resources import load_entry_point

if __name__ == '__main__':
    # Drop a trailing "-script.py", "-script.pyw" or ".exe" from the program
    # name before handing control to the entry point.
    sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0])
    # Load the 'pip3.7' console-script entry point of pip 19.0.3, call it and
    # exit with whatever it returns.
    sys.exit(load_entry_point('pip==19.0.3', 'console_scripts', 'pip3.7')())