def extract_seo_h1_tag(self):
    'Get info from seo_h1_tag'
    html = self.chrome.get_inner_html_by_id('seo_h1_tag')
    if html is None:  # no seo_h1_tag?
        return None
    name = rsub('"[^"]*"', '', html)  # drop quoted attribute values
    name = rsub('<[^>]*>', '', name)  # drop the tags themselves
    if name != '':
        account = {'name': name}
    else:
        account = {'name': 'undetected'}
    href = self.ct.search(r' href="https://www\.facebook\.com/[^/?"]+', html)
    if href is not None:
        account['type'] = 'pg'
        account['path'] = href[32:]  # strip ' href="' (7 chars) plus the domain (25 chars)
        account['link'] = href[7:]   # strip ' href="' only
        account['id'] = 'undetected'
    else:
        href = self.ct.search(r' href="/groups/[^/?"]+', html)
        if href is not None:
            account['type'] = 'groups'
            account['path'] = 'groups_' + href[15:]  # strip ' href="/groups/'
            account['link'] = 'https://facebook.com' + href[7:]
            account['id'] = 'undetected'
        else:
            return None
    return account
def extract_targets(self, target):
    'Extract paths (= URLs without ...instagram.com/) from given targets'
    l = []  # list for the target users (id or path)
    for i in self.ct.split(target):
        i = rsub(r'^.*instagram\.com/', '', i)
        i = rsub(r'/.*$', '', i)
        if i != '' and i != 'p':  # 'p' is the post namespace, not a user
            l.append(i)
    return l
def extract_paths(self, target):
    'Extract Facebook paths from targets that might be URLs'
    l = []  # list for the target users (id or path)
    for i in self.ct.split(target):
        i = rsub(r'^.*facebook\.com/', '', i.rstrip('/'))
        i = rsub(r'&.*$', '', i)  # drop trailing query parameters
        if i != '':
            l.append(i)
    return l
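# A minimal standalone sketch of the path extraction above, assuming ct.split
# behaves like splitting on commas/whitespace (that helper is not shown here).
# The two cleanup substitutions are combined into one for brevity.
from re import split as _split, sub as _sub

def _demo_extract_paths(target, domain='facebook'):
    paths = []
    for part in _split(r'[\s,;]+', target):  # stand-in for self.ct.split
        part = _sub(r'^.*' + domain + r'\.com/', '', part.rstrip('/'))
        part = _sub(r'[&/].*$', '', part)  # drop query strings and extra segments
        if part:
            paths.append(part)
    return paths

# A full profile URL and a bare path both reduce to plain path segments:
assert _demo_extract_paths('https://www.facebook.com/zuck, somegroup') == ['zuck', 'somegroup']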
async def translate(message, client, arguments):
    # Checking whether the input contains a --language argument (DO NOT TOUCH, WAS PAIN)
    try:
        popped = rsearch(r'--([a-zA-Z0-9])\w+', arguments).group()
    except AttributeError:
        google = quote(str(arguments))
        language = translate_to_lang
    else:
        google = quote(str(rsub(r'--([a-zA-Z0-9])\w+', '', arguments)))
        language = popped[2:]  # drop the leading '--'
    # Creating and fetching the link
    query = 'https://translation.googleapis.com/language/translate/v2?key=%s&target=%s&q=%s' % (
        google_api, language, google)
    response = loads(rget(query).text)
    # Trying to create the message
    try:
        detectedlanguage = response['data']['translations'][0]['detectedSourceLanguage']
        translatedtext = response['data']['translations'][0]['translatedText']
        letter = (':cloud: **| ' + detectedlanguage.upper() + ' -> '
                  + language.upper() + ' `' + translatedtext + '`**')
    # if the message can't be created, return an error
    except KeyError:
        letter = ':cloud: **| Invalid language target!**'
    # sending the message
    await client.send_message(message.channel, letter)
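# A small sketch of just the argument parsing above: the regex pulls an optional
# "--xx" language flag out of free text, and the flag is stripped before quoting.
# The helper name and default are illustrative, not part of the original bot.
from re import search as rsearch, sub as rsub

def _parse_translate_args(arguments, default_lang='en'):
    m = rsearch(r'--([a-zA-Z0-9])\w+', arguments)
    if m is None:
        return arguments.strip(), default_lang
    return rsub(r'--([a-zA-Z0-9])\w+', '', arguments).strip(), m.group()[2:]

# "--de hello world" -> ("hello world", "de"); no flag -> the default target.
assert _parse_translate_args('--de hello world') == ('hello world', 'de')
assert _parse_translate_args('hello world') == ('hello world', 'en')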
def get_profile_name(self, html):
    'Extract name'
    m = rsearch('>[^<]+</a>', html)
    if m is not None:
        return m.group()[1:-4]  # strip the leading '>' and trailing '</a>'
    m = rsearch('>[^<]+<span[^>]*>[^<]+</span>[^<]*</a>', html)
    if m is not None:
        return rsub('<[^>]+>', '', m.group()[1:-4])
    return 'undetected'
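# Illustrative check of the two patterns above on hand-written HTML snippets
# (not captured from a real page): a plain anchor, and one with a nested <span>.
from re import search as rsearch, sub as rsub

_plain = '<a href="/profile">Jane Doe</a>'
_span = '<a href="/profile">Jane <span class="v">Doe</span></a>'
assert rsearch('>[^<]+</a>', _plain).group()[1:-4] == 'Jane Doe'
m = rsearch('>[^<]+<span[^>]*>[^<]+</span>[^<]*</a>', _span)
assert rsub('<[^>]+>', '', m.group()[1:-4]) == 'Jane Doe'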
def _conditionalize(self, args):
    part = args[0]
    next = []
    segments = args[1:]
    for segment in segments:
        next.append(self._conditionalize(segment))
    next = ', '.join(next)
    if next:
        next = ''.join([', ', next])
    conditions = []
    for name in rfind(r'[:*](\w+)', part):
        conditions.append(''.join(['"', name, '" in parameters']))
    if conditions:
        conditions = ' and '.join(conditions)
    else:
        conditions = 'True'  # the generated expression is evaluated as Python; lowercase 'true' would be a NameError
    return ''.join(['((', conditions, ') and "".join(["',
                    rsub(r'[:*](\w+)', '", parameters["\\1"], "', part),
                    '"', next, ']) or "")'])
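# Worked example for _conditionalize, derived by hand from the code above: a flat
# segment like '/:action' (no nested parts) yields the expression string below.
# __init__ later strips the trailing ', ""' before the code is evaluated against
# the `parameters` dict:
#   (("action" in parameters) and "".join(["/", parameters["action"], ""]) or "")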
def singularize(cls, what):
    '''Singularizes English words (examples: people => person, sheep => sheep, lines => line)'''
    for x in range(len(cls._uncountable) - 1, -1, -1):
        value = cls._uncountable[x][0]
        if value == what[-len(value):].lower():
            return what
    for x in range(len(cls._irregular) - 1, -1, -1):
        key = cls._irregular[x][1]
        value = cls._irregular[x][0]
        if key == what[-len(key):].lower():
            return what[:-len(key)] + value
    for x in range(len(cls._singular) - 1, -1, -1):
        key = cls._singular[x][0]
        value = cls._singular[x][1]
        if rsearch(key, what, I):
            return rsub(key, value, what, flags=I)  # pass I as flags; as the 4th positional it would be count
    return what
def pluralize(cls, what):
    '''Pluralizes English words (examples: person => people, news => news, post => posts)'''
    for x in range(len(cls._uncountable) - 1, -1, -1):
        value = cls._uncountable[x][0]
        if value == what[-len(value):].lower():
            return what
    for x in range(len(cls._irregular) - 1, -1, -1):
        key = cls._irregular[x][0]
        value = cls._irregular[x][1]
        if key == what[-len(key):].lower():
            return what[:-len(key)] + value
    for x in range(len(cls._plural) - 1, -1, -1):
        key = cls._plural[x][0]
        value = cls._plural[x][1]
        if rsearch(key, what, I):
            return rsub(key, value, what, flags=I)  # pass I as flags; as the 4th positional it would be count
    return what
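# A toy illustration of the rule-table scan used by pluralize/singularize above:
# rules are tried from the end of the table backwards and the first regex that
# matches wins, so later (more specific) rules shadow earlier (general) ones.
# This table is hypothetical and far smaller than the real _plural list.
from re import search as rsearch, sub as rsub, I

_plural_rules = [('$', 's'), ('s$', 'ses'), ('(ax|test)is$', r'\1es')]

def _toy_pluralize(what):
    for x in range(len(_plural_rules) - 1, -1, -1):
        key, value = _plural_rules[x]
        if rsearch(key, what, I):
            return rsub(key, value, what, flags=I)
    return what

assert _toy_pluralize('axis') == 'axes'
assert _toy_pluralize('bus') == 'buses'
assert _toy_pluralize('post') == 'posts'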
def href(self, html):
    'Search href='
    try:
        # [7:] strips the leading ' href="'; HTML-escaped ampersands are restored
        return rsub('&amp;', '&', self.search(' href="[^"]+', html)[7:])
    except TypeError:  # self.search found nothing and returned None
        return None
def urlize(cls, what, delimiter='_'):
    '''Returns the sentence passed as a URL slug (example: what's goin' on out there? => what_s_goin_on_out_there)'''
    return rsub(delimiter + '+', delimiter,
                rsub('[^0-9a-z]', delimiter,
                     cls.underscore(cls.unaccent(cls.latinize(what))))).strip(delimiter)
def underscore(cls, what):
    '''Underscores a camelized word (examples: BlogPosts => blog_posts, MyDBConnector => my_db_connector)'''
    # The final rsub collapses the doubled underscores that acronym boundaries
    # produce, so the MyDBConnector example in the docstring actually holds.
    return rsub('_+', '_',
                rsub('([A-Z]+)', r'_\1',
                     rsub('([A-Z]+)([A-Z][a-z])', r'_\1_\2', what))).lstrip('_').lower()
def __init__(self, route, **kwargs):
    self._route = route
    # set/generate name
    if 'name' in kwargs:
        self._name = kwargs.pop('name')
    # set common options
    if 'defaults' in kwargs:
        self._defaults = kwargs.pop('defaults')
    if 'constraints' in kwargs:
        self._constraints = kwargs.pop('constraints')
    if 'formats' in kwargs:
        self._formats = kwargs.pop('formats')
    if 'limits' in kwargs:
        self._limits = kwargs.pop('limits')
    # everything else should be the map then, unless we have a map argument passed
    if 'map' in kwargs:
        self._map = kwargs.pop('map')
    else:
        self._map = kwargs
    # check the name once again and try to set it against the map
    if not self._name:
        if self._map:
            self._name = '_'.join([str(value) for value in self._map.values()])
        else:
            self._name = id(self)
    # scan parameter names and parameter types (optional/required, single/multiple)
    l = len(route)
    x = paren = 0
    while x < l:
        c = route[x]
        x += 1
        if c == '(':
            paren += 1
        elif c == ')':
            paren -= 1
        elif c == ':' or c == '*':
            name = []
            while x < l and route[x].isalnum():
                name.append(route[x])
                x += 1
            name = ''.join(name)
            self._parameters.append(name)
            if not paren:  # outside any parens means the parameter is required
                self._required.append(name)
            if c == '*':
                self._multiple.append(name)
    # set defaults for action and format
    if 'action' in self._parameters and 'action' not in self._defaults:
        self._defaults['action'] = 'index'
    if 'format' in self._parameters and 'format' not in self._defaults:
        self._defaults['format'] = 'html'
    # build the regular expression with a regular expression (and a couple of string substitutions, though)
    self._regex = rsub(r'[:*](\w+)', self._constraintize,
                       route.replace('(', '(?:').replace(')', ')?').replace('.', '\\.'))
    # build the evaluated urlize code with a nested expression and a couple of regular expression substitutions
    expr = nestedExpr('(', ')')
    parsed = expr.parseString(''.join(['(', route, ')']))[0]
    self._code = ''.join(['result = ', self._conditionalize(parsed).replace(', ""', '')])
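# A standalone sketch of the parameter scan in __init__ above: walk the route
# string, track parenthesis depth, and collect :single and *multiple parameter
# names; names outside any parens are required. The helper name is hypothetical,
# the logic mirrors the method.
def _scan_route(route):
    parameters, required, multiple = [], [], []
    l = len(route)
    x = paren = 0
    while x < l:
        c = route[x]
        x += 1
        if c == '(':
            paren += 1
        elif c == ')':
            paren -= 1
        elif c in (':', '*'):
            name = []
            while x < l and route[x].isalnum():
                name.append(route[x])
                x += 1
            name = ''.join(name)
            parameters.append(name)
            if not paren:
                required.append(name)
            if c == '*':
                multiple.append(name)
    return parameters, required, multiple

# ':controller' is required; the parenthesized ':action' and '*rest' are optional.
assert _scan_route('/:controller(/:action(/*rest))') == \
    (['controller', 'action', 'rest'], ['controller'], ['rest'])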
def sub(x, y, z):
    'Replace pattern y with z in string x (argument-order shim around re.sub)'
    return rsub(y, z, x)
def src(self, html):
    'Get src='
    try:
        # [6:] strips the leading ' src="'; HTML-escaped ampersands are restored
        return rsub('&amp;', '&', self.search(' src="[^"]+', html)[6:])
    except TypeError:  # self.search found nothing and returned None
        return None
def retrieve():
    database = connect('database.db')
    topics, feeds, documents, titles, descriptions = [], [], [], [], []
    links, datetimes, thumbnails, doc_topics = [], [], [], []
    ### GET DATABASE DATA
    for row in database.execute('SELECT * FROM topics;'):
        topics.append([row[0], str(row[1])])
    for row in database.execute('SELECT fds_topic, fds_link FROM feeds;'):
        feeds.append([row[0], str(row[1])])
    for row in database.execute('SELECT doc_id, doc_datetime, doc_link FROM documents'):
        documents.append([row[0], str(row[1]), str(row[2]), []])
        for row2 in database.execute('SELECT tpd_topic FROM tpc_doc WHERE tpd_document = '
                                     + str(row[0]) + ';'):
            documents[-1][3].append(row2[0])
    ### GET RSS INFO
    for topic, link in feeds:
        html = urlopen(link).read()
        soup = BeautifulSoup(html)
        items = [item for item in soup.find_all('item')]
        for item in items:
            doc_topics.append(topic)
            if item.title is not None:
                title = item.title.findAll(text=True)
                if len(title) == 1:
                    titles.append(title[0].encode('ascii', errors='ignore'))
                else:
                    titles.append('')
            if item.description is not None:
                desc = item.description.findAll(text=True)
                if len(desc) == 1:
                    descriptions.append(desc[0].encode('ascii', errors='ignore'))
                else:
                    descriptions.append('')
            if item.guid is not None:
                link = item.guid.findAll(text=True)
                if len(link) == 1:
                    links.append(link[0].encode('ascii', errors='ignore'))
                else:
                    links.append('')
            if item.pubdate is not None:
                date = item.pubdate.findAll(text=True)
                if len(date) == 1:
                    datetimes.append(date[0].encode('ascii', errors='ignore'))
                else:
                    datetimes.append('')
            thumb = item.findChildren('media:thumbnail', {'width': '144'})
            if len(thumb) == 1:
                thumbnails.append(thumb[0]['url'].encode('ascii', errors='ignore'))
            else:
                thumbnails.append('')
    ### GET DOCUMENTS
    new = 0
    updated = 0
    for index in range(len(titles)):
        print('(' + str(index + 1).ljust(4) + str(doc_topics[index]).ljust(2) + ')'),
        datetime = parser.parse(datetimes[index])
        try:
            pos = [doc[2] for doc in documents].index(links[index])
        except ValueError:  # link not seen before
            refresh = 0
        else:
            if doc_topics[index] not in documents[pos][3]:
                database.execute('INSERT INTO tpc_doc (tpd_topic, tpd_document) VALUES' +
                                 ' (' + str(doc_topics[index]) + ', ' + str(documents[pos][0]) + ');')
                documents[pos][3].append(doc_topics[index])
                database.commit()
                print('*'),
            if str(datetime) == str(documents[pos][1]):
                print('Unchanged Article')
                continue
            refresh = 1
        not_article = ('VIDEO', 'AUDIO', 'In pictures', 'Your pictures')
        if titles[index].startswith(not_article):
            print('Not an Article')
            continue
        html = urlopen(links[index]).read()
        soup = BeautifulSoup(html)
        title = str(soup.title)[7:-8].decode('utf-8').encode('ascii', errors='ignore')
        temp = ['BBC News', 'BBC History', 'BBC Science', 'BBC Consumer',
                'BBC Arts', 'BBC Nature']
        if any(i in title for i in temp):
            division = 'story-body'
        elif 'BBC Sport' in title:
            division = 'article'
        elif 'BBC - Capital' in title:
            division = 'description|story-body'
        else:
            print('Website not known')
            continue
        content = [div for div in soup.find_all('div', {'class': rcompile(division)})]
        soup = BeautifulSoup(' '.join(list(map(str, content))))
        paragraphs = [p for p in soup.findAll('p')]
        soup = BeautifulSoup(' '.join(list(map(str, paragraphs))))
        [p.extract() for p in soup.findAll('p') if str(p).startswith('<p><strong>')]
        [p.extract() for p in soup.findAll('p', {'class': rcompile('disclaimer|terms')})]
        text = soup.get_text().replace('\n', ' ').replace('\t', ' ').replace('\r', ' ')
        text = text.encode('ascii', errors='ignore')
        if text == '':
            print('Empty Text')
            continue
        text = rsub(' +', ' ', text)  # re.sub returns a new string; reassign it
        text = text.strip()
        text = '\n'.join([sentence for sentence in sent_tokenize(text)])
        if refresh == 1:
            documents[pos][1] = str(datetime)
            database.execute('DELETE FROM entities WHERE ent_document = ' +
                             str(documents[pos][0]) + ';')
            database.execute('UPDATE documents SET doc_processed = 0,' +
                             ' doc_datetime = \'' + str(datetime) + '\',' +
                             ' doc_thumbnail = \'' + thumbnails[index] + '\',' +
                             ' doc_title = \'' + titles[index].replace('\'', '\'\'') + '\',' +
                             ' doc_description = \'' + descriptions[index].replace('\'', '\'\'') + '\',' +
                             ' doc_text = \'' + text.replace('\'', '\'\'') + '\'' +
                             ' WHERE doc_link = \'' + links[index] + '\';')
            print('Update - ' + titles[index])
            updated += 1
        else:
            documents.append([len(documents) + 1, datetime, links[index], [doc_topics[index]]])
            database.execute('INSERT INTO tpc_doc (tpd_topic, tpd_document) VALUES' +
                             ' (' + str(doc_topics[index]) + ', ' + str(documents[-1][0]) + ');')
            database.execute('INSERT INTO documents (doc_datetime, doc_link, doc_thumbnail,' +
                             ' doc_title, doc_description, doc_text) VALUES (\'' +
                             str(datetime) + '\',\'' + links[index] + '\',\'' + thumbnails[index] + '\',\'' +
                             titles[index].replace('\'', '\'\'') + '\',\'' +
                             descriptions[index].replace('\'', '\'\'') + '\',\'' +
                             text.replace('\'', '\'\'') + '\');')
            print('Insert - ' + titles[index])
            new += 1
        database.commit()
    print new, "new,", updated, "updated."
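# Note on the whitespace-collapse step fixed above: re.sub returns a new string
# and never mutates its argument, so the result must be reassigned.
from re import sub as rsub
s = 'a  b   c'
assert rsub(' +', ' ', s) == 'a b c' and s == 'a  b   c'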
#!/bin/python
import sys
from re import sub as rsub
import WordSub, DefaultSubs

"""TODO: Bug with multi-sentence pattern/that/template(s)"""

if __name__ == "__main__":
    xml = ['<?xml version=\'1.0\' encoding=\'ISO-8859-1\'?>', '<aiml>']
    that = rsub(r'<[a-zA-Z\/][^>]*>', '', sys.argv[1].upper())
    pattern = rsub(r'<[a-zA-Z\/][^>]*>', '', sys.argv[2].upper())
    template = rsub(r'<[a-zA-Z\/][^>]*>', '', sys.argv[3])
    subbers = {}
    # Do the AIML default substitutions for the best match.
    subbers['gender'] = WordSub.WordSub(DefaultSubs.defaultGender)
    subbers['person'] = WordSub.WordSub(DefaultSubs.defaultPerson)
    subbers['person2'] = WordSub.WordSub(DefaultSubs.defaultPerson2)
    subbers['normal'] = WordSub.WordSub(DefaultSubs.defaultNormal)
    for sub in subbers:
        that = subbers[sub].sub(that)
        pattern = subbers[sub].sub(pattern)
    xml.append('<category>')
    xml.append('<pattern>%s</pattern>' % pattern)
    if not that == '':
        xml.append('<that>%s</that>' % that)
    xml.append('<template>%s</template>' % template)
    xml.append('</category>')
    xml.append('</aiml>')
    f = open('learning.aiml', 'w')
    f.write('\n'.join(xml))
    f.flush()
    f.close()  # close the file so the buffer is released as well as flushed
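# Example invocation (the script name is hypothetical); the positional arguments
# are that, pattern, template, and the generated category is written to
# learning.aiml in the current directory. argv[1] may be an empty string, in
# which case no <that> element is emitted:
#
#   python make_category.py "" "HELLO *" "Hi there!"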
def underscore(what):
    # Same two-pass camel-case split as the Inflector method above, with a final
    # rsub collapsing the doubled underscores produced at acronym boundaries.
    return rsub('_+', '_',
                rsub('([A-Z]+)', r'_\1',
                     rsub('([A-Z]+)([A-Z][a-z])', r'_\1_\2', what))).lstrip('_').lower()
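# Quick checks of the camel-case split above: the inner pass separates acronym/
# word boundaries (DBConnector -> DB_Connector), the middle pass prefixes every
# capital run with '_', and the outer pass collapses the resulting doubles.
assert underscore('BlogPosts') == 'blog_posts'
assert underscore('MyDBConnector') == 'my_db_connector'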