def get_ticker_article_age( sym ):
    """Return the publication date of the most recent Seeking Alpha article
    by author 'zetakap' for ticker *sym*.

    Returns a time-struct-like result from parsedatetime's parseDateText;
    if no zetakap article is found, returns the epoch (gmtime(0)) so the
    caller treats the ticker as maximally old.
    """
    sym_url = 'http://seekingalpha.com/symbol/' + sym.lower()
    html = get_pretty_html( sym_url )
    # Anchor on the author link; the surrounding <li> holds the article metadata.
    begin = '<a href="/author/zetakap">'
    end = '</li>'
    author_content = find_between( html, begin, end )
    if author_content == '':
        # No article by this author: report epoch so it always counts as old.
        return gmtime(0)
    else:
        # The date text sits between the closing </span> and the bullet span.
        begin = '</span>'
        end = '<span class="bullet">'
        date = find_between( author_content, begin, end )
        date = date.strip()
        c = pdc.Constants()
        p = pdt.Calendar(c)
        # Site renders dates like "on Jan. 3" — drop the leading "on".
        most_recent_pub_date = date.replace('on','',1).strip()
        # parse("this year") yields the current year; the page date omits it,
        # so append it before parsing the full date text.
        date = p.parse("this year")
        result = p.parseDateText( " ".join( [ most_recent_pub_date, str( date[0].tm_year ) ] ) )
        return result
def set_sector(self):
    """Extract the sector name from the cached Finviz quote HTML
    and store the stripped value on self.sector."""
    link_prefix = '<a href="screener.ashx?v=111&f=sec_'
    link_suffix = '" class="tab-link">'
    # Pass 1: recover the sector slug embedded in the screener link URL.
    slug = find_between( self.html, link_prefix, link_suffix )
    # Pass 2: the anchor text between the full opening tag and </a>
    # is the human-readable sector name.
    full_anchor = link_prefix + slug + link_suffix
    self.sector = find_between( self.html, full_anchor, '</a>' ).strip()
def set_industry(self):
    """Extract the industry name from the cached Finviz quote HTML
    and store the stripped value on self.industry."""
    link_prefix = '<a href="screener.ashx?v=111&f=ind_'
    link_suffix = '" class="tab-link">'
    # Pass 1: recover the industry slug embedded in the screener link URL.
    slug = find_between( self.html, link_prefix, link_suffix )
    # Pass 2: the anchor text of the reconstructed link is the display name.
    full_anchor = link_prefix + slug + link_suffix
    self.industry = find_between( self.html, full_anchor, '</a>' ).strip()
def get_comic_info(comic_id):
    """Scrape a manhuagui.com comic page: create a directory named after the
    comic title, then for each chapter list create a chapter subdirectory and
    download every page via get_pic().

    Side effects: creates directories, changes and restores the process CWD.
    """
    r = requests.get('http://www.manhuagui.com/comic/%s/' % comic_id, headers=headers)
    title = find_between(r.text, '<div class="book-title">', '</div>')
    # Strip any nested markup from the title text.
    title = re.sub('<[^>]*>', '', title).strip()
    intro = find_between(r.text, '<div id="intro-all" class="none">', '</div>')
    intro = re.sub('<[^>]*>', '', intro).strip()
    print(title)
    print(intro)
    try:
        os.mkdir(title)
    except OSError as e:
        # Directory already existing is fine; anything else is a real error.
        if e.errno != errno.EEXIST:
            raise
    os.chdir(title)
    index = 0
    i = 0
    while True:
        # Advance to the next chapter-list block; find() returns -1 when done.
        index = r.text.find(
            r'''<div class="chapter-list cf mt10" id='chapter-list-0'>''', index + 1)
        if index != -1:
            # Chapter title is the nearest <span> BEFORE this block (reverse search).
            chapter_title = find_between_r(r.text, '<span>', '</span>', end=index)
            chapter_text = find_between(
                r.text,
                r'''<div class="chapter-list cf mt10" id='chapter-list-0'>''',
                '</div>', index)
            print(chapter_title)
            l = get_chapter_info(chapter_text)
            try:
                os.mkdir(chapter_title)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
            os.chdir(chapter_title)
            for chapter in l:
                # chapter[0] is the relative chapter URL — presumably; confirm
                # against get_chapter_info's return shape.
                get_pic('http://www.manhuagui.com/%s' % chapter[0])
            #with open('chapter-%s-%s.txt' % (i, chapter_title), 'w') as f:
                #json.dump(l, f)
            i += 1
            os.chdir('..')
        else:
            break
    os.chdir('..')
def getFoods(campusnum, mealnum):
    """Scrape the menu for one meal at one campus and return a nested dict:
    {category: {item name: nutrition info}}.

    Returns the string 'error' if the page fetch failed, or a no-foods
    message string when nothing was scraped.
    """
    mealJson = {}
    soup = getHTML(campusnum, mealnum)
    if soup == 'error':
        return 'error'
    log.status('Scraping foods for meal: {}'.format(mealNames[mealnum]))
    # Category headers are <p style="margin: 3px 0;"> elements.
    categ = soup.find_all('p', attrs={'style': 'margin: 3px 0;'})
    categories = []
    for cat in categ:
        categories.append(cat.b.string.replace('--', '').strip())
    # Slice the raw HTML between consecutive category headers.
    # NOTE(review): range(len - 1) means the LAST category is never
    # processed (it has no following header to bound the slice) — confirm
    # whether the page has a trailing sentinel header.
    for a in range(0, len(categ) - 1):
        souptext = find_between(str(soup), str(categ[a]), str(categ[a + 1]))
        soup2 = BeautifulSoup(souptext, 'lxml')
        # NOTE(review): {'class', 'col-1'} is a SET literal, not a dict —
        # presumably {'class': 'col-1'} was intended; verify what BS4
        # matches with this argument.
        items = soup2.find_all('div', attrs={'class', 'col-1'})
        mealJson[categories[a]] = {}
        for item in items:
            # Item id comes from the hidden <input>; '*' must be URL-escaped.
            itID = item.input['value'].replace('*', '%2A')
            tag = item.find('input')
            # Remove the input so item.string is just the food name.
            tag.extract()
            mealJson[categories[a]][item.string] = getNutritionInfo(itID)
            if mealJson[categories[a]][
                    item.string] == 'No Information Available for this Item':
                log.warn('No information available for item: {}'.format(
                    item.string))
    if mealJson == {}:
        return 'No foods available for this meal today'
    return mealJson
def getibiblio(word):
    """Look up *word* in the ibiblio Webster dictionary and return its
    first <def> entry as plain text, terminated with a period."""
    query = word.replace(' ', '+')
    url = 'http://www.ibiblio.org/webster/cgi-bin/headword_search.pl?query=' + query
    page = urllib.request.urlopen(url).read().decode()
    definition = utils.find_between(page, '<def>', '</def>').strip()
    # Collapse runs of spaces, then end the sentence with a period.
    definition = re.sub(' +', ' ', definition) + '.'
    return utils.strip_html_tags(definition)
def set_beta(self):
    """Parse the Beta figure out of the cached quote HTML and store it
    (tag-stripped) on self.beta."""
    snippet = find_between( self.html, 'body=[Beta]', '</b>' )
    # An empty or single-character snippet means the field is absent;
    # store it verbatim.
    if len(snippet) <= 1:
        self.beta = snippet
        return
    value = snippet.split('<b>')[1]
    self.beta = re.sub(r'<[^>]*?>', '', value).strip()
def wrapper(*args, **kwargs):
    # Pull the username segment out of the request path and only run the
    # wrapped view if it matches the logged-in user; otherwise 502.
    # log('take request: <{}>'.format(request.path))
    # NOTE(review): the trailing [0] takes the FIRST element of whatever
    # find_between returns — presumably it returns a list here; if it
    # returns a plain string this yields only the first character. Confirm
    # against this project's find_between implementation.
    username = find_between(request.path, '/user/', '/')[0]
    log('take username: <{}>'.format(username))
    u = current_user()
    if u.username == username:
        return f(*args, **kwargs)
    else:
        abort(502)
def recv(fp):
    """IRC line handler for Red Eclipse game channels: resolves the speaking
    in-game user's auth name via the redflare report service, grants access
    accordingly, then dispatches prefixed commands to commands.doptext().
    """
    # Only act on external (relayed) lines from a Red Eclipse relay.
    if fp.external() and isredeclipse(fp):
        redflare = fp.server.import_module('redflare.redflare', False)
        commands = fp.server.import_module('commands', False)
        # On QUIT/PART, clear any cached authname state for this relay nick.
        if fp.sp.iscode('QUIT') or fp.sp.iscode('PART'):
            for k in fp.server.state:
                if k.find('%s.authname.' % fp.sp.sendernick) == 0:
                    fp.server.state[k] = ''
        ptext = ""
        # Strip IRC formatting-reset control characters.
        fp.sp.text = fp.sp.text.replace('\x0f', '')
        # Relayed lines look like "<player> message"; extract the player name.
        try:
            fp.user = utils.find_between(fp.sp.text, '<', '> ')
        except IndexError:
            fp.user = ''
        fp.setaccess("%s==" % fp.user)
        authname = ""
        rf = redflare.RedFlare('http://redflare.ofthings.net/reports')
        # reservers db maps relay nick -> "host:port" (host may contain ':').
        entry = fp.server.db['reservers'][fp.sp.sendernick].split(':')
        host = ':'.join(entry[0:-1])
        if host == '%':
            # '%' means "use the relay's own host": resolve it to an IP.
            host = fp.sp.host.split('@')[1]
            host = socket.gethostbyaddr(host)[2][0]
        if not host:
            host = ''
        # Find the matching game server and look up the player's auth name.
        for server in rf.servers:
            if server['host'] == host and server['port'] == int(entry[-1]):
                for player in server['playerauths']:
                    if player[0] == fp.user:
                        authname = player[1]
                        fp.setaccess("%s==re:%s" % (fp.user, authname))
        # Everything after "> " is the actual chat message.
        try:
            text = fp.sp.text[fp.sp.text.index('> ') + 2:]
        except ValueError:
            return
        prefixt = fp.channel.entry['prefix'].split()
        # Accept "botnick, ", "botnick: " or any configured channel prefix.
        possible = [
            fp.server.nick + ', ',
            fp.server.nick + ': ',
        ] + prefixt
        found = False
        prefix = prefixt[0]
        for p in possible:
            if text.find(p) == 0:
                found = True
                prefix = p
                break
        if not found:
            return
        ptext = text[len(prefix):]
        # Ignore lines that start with punctuation after the prefix
        # (e.g. smileys / other bots' command prefixes).
        if len(ptext.lstrip(string.punctuation)) < len(ptext):
            return
        if ptext:
            fp.reply(commands.doptext(fp, ptext))
def wrapper(*args, **kwargs):
    # Pull the topic id out of the request path and only run the wrapped
    # view if the current user owns that topic; otherwise 502.
    # log('take request: <{}>'.format(request.path))
    # NOTE(review): the trailing [0] takes the FIRST element of whatever
    # find_between returns — presumably a list; if it returns a plain
    # string this yields only the first character. Confirm against this
    # project's find_between implementation.
    tid = find_between(request.path, '/topic/', '/')[0]
    log('take tid: <{}>'.format(tid))
    u = current_user()
    t = Topic.one(id=tid)
    if t.user_id == u.id:
        return f(*args, **kwargs)
    else:
        abort(502)
def get_pic(url):
    """Download every page image of one manhuagui chapter at *url* into a
    directory named '<chapter name>-<page count>p', fetching pages in
    parallel with one thread per image.

    Side effects: creates the chapter directory, prints progress.
    """
    s = requests.Session()
    r = s.get(url, headers=headers)
    # The page data is packed inside an eval()'d LZ-string blob; run it
    # through the bundled LZ decoder JS to recover the cInfo JSON.
    js = find_between(r.text, r'["\x65\x76\x61\x6c"]', '</script>')
    info = execjs.compile(LZjs).eval(js)
    info = find_between(info, 'cInfo=', '||{};')
    info = json.loads(info)
    print(info)
    name = info['cname']
    path = info['path']
    pages = info['len']
    dir_name = '%s-%sp' % (name, pages)
    try:
        os.mkdir(dir_name)
    except OSError as e:
        # Already-existing directory is fine; re-raise anything else.
        if e.errno != errno.EEXIST:
            raise
    args_list = []
    i = 0
    for filename in info['files']:
        i += 1
        # The CDN serves the original file when the '.webp' suffix is dropped.
        if filename.endswith('.webp'):
            filename = filename[:-5]
        pic_url = 'http://{}{}{}'.format(servers[0], path, filename)
        print(pic_url)
        # BUG FIX: copy the shared headers dict instead of aliasing it —
        # the original `_headers = headers` made the referer assignment
        # below mutate the module-level headers used by every request.
        _headers = dict(headers)
        _headers['referer'] = url
        ext = os.path.splitext(filename)[1]
        args_list.append((s, pic_url, _headers, dir_name, '%s%s' % (i, ext)))
    threads = [threading.Thread(target=dlfile, args=a) for a in args_list]
    # Plain loops instead of side-effecting list comprehensions.
    for t in threads:
        t.start()
    for t in threads:
        t.join()
def __init__(self, screen ):
    """Build the ticker list for a Finviz screen: walk the quote links on
    the screener results page, skip pending/excluded tickers, and keep only
    tickers whose latest article is old enough, up to max_len accepted ones.
    """
    self.screen = screen
    pending_tickers = self.get_pending_tickers()
    url = make_finviz_url( screen )
    pretty_html = get_pretty_html( url )
    self.tickers = []
    counter = 0
    # NOTE: randrange(6, 7) always returns 6 — the cap is effectively fixed.
    max_len = random.randrange(6, 7)
    # Loop while a quote link remains in the (shrinking) HTML.
    while find_between( pretty_html, '<a href="quote.ashx?t=', '&' ):
        if counter > max_len:
            break
        ticker = find_between( pretty_html, '<a href="quote.ashx?t=', '&' )
        # Consume this link so the next iteration finds the following one.
        pretty_html = pretty_html.replace('<a href="quote.ashx?t=','',1)
        if ticker in pending_tickers:
            continue
        if ticker in excluded_tickers:
            continue
        print "testing ticker " + ticker
        # only take tickers older than 25 days
        age = get_ticker_article_age( ticker )
        if is_old_enough( age ):
            self.tickers.append( ticker )
            counter = counter + 1
def set_short(self):
    """Parse the short-interest percentage out of the cached quote HTML.

    Values above 12%, or an unknown '-', are pinned to '100%' so downstream
    filters treat them as heavily shorted.
    """
    snippet = find_between( self.html, 'body=[Short interest share]', '</b>' )
    # Empty or single-character snippet: field absent, store verbatim.
    if len(snippet) <= 1:
        self.short_interest = snippet
        return
    cleaned = re.sub(r'<[^>]*?>', '', snippet.split('<b>')[1]).strip()
    numeric = cleaned.replace('%','')
    if numeric == '-' or float(numeric) > 12:
        self.short_interest = '100%'
    else:
        self.short_interest = cleaned
def set_cap(self):
    """Parse the market capitalization out of the cached quote HTML.

    Micro caps under $100M are flagged by storing '0' instead of the value.
    """
    snippet = find_between( self.html, 'body=[Market capitalization]', '</b>' )
    # Empty or single-character snippet: field absent, store verbatim.
    if len(snippet) <= 1:
        self.cap = snippet
        return
    value = re.sub(r'<[^>]*?>', '', snippet.split('<b>')[1]).strip()
    # An 'M' suffix means millions; reject anything under 100M.
    if 'M' in value and long(float(value.replace('M',''))) < 100:
        self.cap = '0'
    else:
        self.cap = value
def set_screen_pairs(self):
    """Collect (abbreviation, value) pairs for every metric in self.screen
    by scraping the quote HTML, appending them to self.screen_pairs.

    Temporarily inserts a 'PO' (payout?) metric right after any dividend
    screen ('D'/'DH'/'DVH') and removes it again afterwards, so self.screen
    is left unchanged on exit.
    """
    ind = 0
    # If a dividend-style screen is present, scrape 'PO' immediately after it.
    for screen in self.screen:
        if screen in ['D','DH','DVH']:
            ind = self.screen.index(screen) + 1
            self.screen.insert( ind,'PO' )
            break
    for screen in self.screen:
        # Skip blanks and metrics the caller has asked to ignore.
        if screen == '' or screen in ignored:
            continue
        # html_maps translates the screen code to the tooltip body label.
        begin = "body=[%s]" % html_maps[screen]
        end = '</b>'
        screen_html = find_between( self.html, begin, end )
        # Empty or single-character snippet means the field is absent.
        if screen_html == '' or len(screen_html) == 1:
            continue
        screen_value = screen_html.split('<b>')[1]
        pair = [ abbrevs[screen], re.sub(r'<[^>]*?>', '', screen_value).strip() ]
        self.screen_pairs.append( pair )
    # Undo the temporary insertion.
    if 'PO' in self.screen:
        self.screen.remove('PO')
def set_desc(self):
    """Pull the company profile paragraph out of the quote HTML and store
    the stripped text on self.desc."""
    raw = find_between( self.html,
                        '<td class="fullview-profile" align="left">',
                        '</td>' )
    self.desc = raw.strip()
def get_title(url, timeout=1):
    """Fetch *url* and return the contents of its <title> tag.

    timeout is passed straight through to requests.get (seconds).
    """
    response = requests.get(url, timeout=timeout)
    page = response.content.decode()
    return utils.find_between(page, '<title>', '</title>')
import utils

# For every downloaded CSS file listed in css.txt, print each url(...)
# reference it contains, with surrounding quotes removed.
css_base = 'downloads/'
for css_file in open("css.txt", 'r'):
    css_file = css_base + css_file.strip()
    print "Base: " + css_file
    for line in open(css_file, 'r'):
        # Match both 'url(' and the occasional 'url (' spelling.
        if ('url(' in line) or ('url (' in line):
            print utils.find_between(line.strip(), "url(", ")").replace("\"", "").replace("'", "")
def getEntityLocation(location_string):
    """Resolve a Hindi free-text location string against the census gazetteer.

    Returns a tuple (alpha, S, D, SD, PT) where S/D/SD are the matched
    state / district / sub-district name lists, PT is the panchayat/town
    (never filled in this version), and alpha is 0 for a perfect match,
    -1 for unparseable input, or the best Hamming distance otherwise.
    """
    census_hi_file = "census_hindi_sd.csv"
    cdf = pandas.read_csv(census_hi_file)
    cdf['name_hi'] = cdf['name_hi'].str.strip()
    # ---- Segregate states / districts / sub-districts / panchayats-towns ----
    # A row's level is encoded by which lower-level codes are zero.
    states = cdf[['state_code', 'name_en', 'name_hi'
                  ]][(cdf.district_code == 0) & (cdf.subdistrict_code == 0) &
                     (cdf.panchayat_town_code == 0) & (cdf.state_code != 0)]
    state = states.set_index('state_code')
    districts = cdf[[
        'district_code', 'name_en', 'name_hi'
    ]][(cdf.subdistrict_code == 0) & (cdf.panchayat_town_code == 0) &
       (cdf.state_code != 0) & (cdf.district_code != 0)]
    district = districts.set_index('district_code')
    sub_districts = cdf[[
        'subdistrict_code', 'name_en', 'name_hi'
    ]][(cdf.panchayat_town_code == 0) & (cdf.state_code != 0) &
       (cdf.district_code != 0) & (cdf.subdistrict_code != 0)]
    sub_district = sub_districts.set_index('subdistrict_code')
    panchayats_towns = cdf[[
        'panchayat_town_code', 'name_en', 'name_hi'
    ]][(cdf.state_code != 0) & (cdf.district_code != 0) &
       (cdf.subdistrict_code != 0) & (cdf.panchayat_town_code != 0)]
    # ---- Output CSV: write the header row only ----
    output_file = "output.csv"
    output_file_fields = [
        'File', 'locations', 'State', 'District', 'Subdistrict',
        'Panchayat/town', 'Perfect Match', 'location : hamming d/s',
        'time(in sec)'
    ]
    # Writing fields into csv files
    with open(output_file, 'w', newline='') as csvfile:
        # creating a csv writer object
        csvwriter = csv.writer(csvfile)
        # writing the fields
        csvwriter.writerow(output_file_fields)
    set_strings = set()  # unique strings
    # ---- Main matching loop ----
    flag_perfectmatch = False  # tracks whether any direct match was found
    S = []
    D = []
    SD = []
    PT = ""
    Loc_hd = ""
    list_output = []
    locations = location_string
    if locations == 'n':  # location contains NaN and can't be processed
        print("can't understand")
        return (-1, S, D, SD, PT)
    # De-duplicate the comma-separated entities while preserving order.
    location_entity = list(dict.fromkeys(locations.split(',')))
    print("Entities : ", location_entity)
    for location in location_entity:
        #### State direct match: whole-word containment of a state name.
        for loc in list(states["name_hi"]):
            if " " + loc + " " in location:
                S.append(loc)
                location = location.replace(loc, '')
                alphastate = 0
                flag_perfectmatch = True
                break
        #### District direct match, narrowed to the matched state if any.
        if len(S) != 0:
            for s in S:
                indexnum = cdf[cdf['name_hi'] == s].index.values.astype(int)[0]
                statenum = cdf["state_code"][indexnum]
                # NOTE(review): this find_between appears to slice the
                # dataframe to rows with state_code in [statenum, statenum+1)
                # — confirm against this project's find_between.
                inState = find_between(cdf, 'state_code', statenum, statenum + 1)
                districtsinState = inState[[
                    'district_code', 'name_en', 'name_hi'
                ]][(cdf.subdistrict_code == 0) & (cdf.panchayat_town_code == 0) &
                   (cdf.state_code != 0) & (cdf.district_code != 0)]
                possibledistricts = districtsinState
        else:
            possibledistricts = districts
        for loc in list(possibledistricts["name_hi"]):
            if " " + loc + " " in location:
                D.append(loc)
                location = location.replace(loc, '')
                alphadistrict = 0
                flag_perfectmatch = True
                break
        #### Subdistrict direct match, narrowed to district, else state.
        if len(D) != 0:
            for d in D:
                indexnum = cdf[cdf['name_hi'] == d].index.values.astype(int)[0]
                districtnum = cdf["district_code"][indexnum]
                inDistrict = find_between(cdf, 'district_code', districtnum,
                                          districtnum + 1)
                subdistrictsinstate = inDistrict[[
                    'subdistrict_code', 'name_en', 'name_hi'
                ]][(cdf.panchayat_town_code == 0) & (cdf.state_code != 0) &
                   (cdf.district_code != 0) & (cdf.subdistrict_code != 0)]
                possiblesubdistricts = subdistrictsinstate
        elif len(S) != 0:
            for s in S:
                indexnum = cdf[cdf['name_hi'] == s].index.values.astype(int)[0]
                statenum = cdf["state_code"][indexnum]
                inState = find_between(cdf, 'state_code', statenum, statenum + 1)
                subdistrictsinstate = inState[[
                    'subdistrict_code', 'name_en', 'name_hi'
                ]][(cdf.panchayat_town_code == 0) & (cdf.state_code != 0) &
                   (cdf.district_code != 0) & (cdf.subdistrict_code != 0)]
                possiblesubdistricts = subdistrictsinstate
        else:
            possiblesubdistricts = sub_districts
        for loc in list(possiblesubdistricts["name_hi"]):
            if " " + loc + " " in location:
                SD.append(loc)
                location = location.replace(loc, '')
                alphasubdistrict = 0
                flag_perfectmatch = True
                break
        #### Backpropagate: fill in district from subdistrict, state from district.
        if len(D) == 0 and len(SD) != 0:
            for sd in SD:
                l = (
                    possiblesubdistricts[possiblesubdistricts["name_hi"] == sd]
                ).index.tolist()  # index of all matched rows
                for ll in l:  # for each index print corresponding District, State
                    print("District Code: ", cdf.at[ll, 'district_code'],
                          ", District: ",
                          district.at[cdf.at[ll, 'district_code'], 'name_hi'])
                    D.append(district.at[cdf.at[ll, 'district_code'], 'name_hi'])
        if len(S) == 0 and len(D) != 0:
            for d in D:
                l = (possibledistricts[possibledistricts["name_hi"] == d]
                     ).index.tolist()  # index of all matched rows
                for ll in l:  # for each index print corresponding State
                    print("State Code: ", cdf.at[ll, 'state_code'], ", State: ",
                          state.at[cdf.at[ll, 'state_code'], 'name_hi'])
                    S.append(state.at[cdf.at[ll, 'state_code'], 'name_hi'])
        #### Approximate matching: best Hamming distance over n-grams (< 5).
        if len(S) == 0:
            min_d_state = 5
            min_s_state = ""
            for loc in list(states["name_hi"]):
                lenloc = len(loc.split())
                tokenised_instance = location.split()
                # All n-grams of the location with the same word count as loc.
                ngrams = list(
                    zip(*[tokenised_instance[i:] for i in range(lenloc)]))
                ngrams = [' '.join(ngram) for ngram in ngrams]
                for ng in ngrams:
                    d = textdistance.hamming(ng, loc)  # Hamming textdistance algo
                    if (d < min_d_state):
                        min_s_state = loc
                        min_d_state = d
        if len(D) == 0:
            min_d_district = 5
            min_s_district = ""
            for loc in list(possibledistricts["name_hi"]):
                lenloc = len(loc.split())
                tokenised_instance = location.split()
                ngrams = list(
                    zip(*[tokenised_instance[i:] for i in range(lenloc)]))
                ngrams = [' '.join(ngram) for ngram in ngrams]
                for ng in ngrams:
                    d = textdistance.hamming(ng, loc)  # Hamming textdistance algo
                    if (d < min_d_district):
                        min_s_district = loc
                        min_d_district = d
        # Pick the closer of the approximate state/district candidates.
        alpha = 5
        if len(S) == 0 and len(D) == 0:
            if min_s_state != "" and min_s_district != "":
                if min_d_district < min_d_state:
                    D.append(min_s_district)
                    alpha = min_d_district
                else:
                    S.append(min_s_state)
                    alpha = min_d_state
            elif min_s_district != "":
                D.append(min_s_district)
                alpha = min_d_district
            elif min_s_state != "":
                S.append(min_s_state)
                alpha = min_d_state
        elif len(D) == 0:
            if min_s_district != "":
                D.append(min_s_district)
                alpha = min_d_district
        #### Backpropagate again after approximate matches were added.
        if len(D) == 0 and len(SD) != 0:
            for sd in SD:
                l = (
                    possiblesubdistricts[possiblesubdistricts["name_hi"] == sd]
                ).index.tolist()  # index of all matched rows
                for ll in l:  # for each index print corresponding District, State
                    print("District Code: ", cdf.at[ll, 'district_code'],
                          ", District: ",
                          district.at[cdf.at[ll, 'district_code'], 'name_hi'])
                    D.append(district.at[cdf.at[ll, 'district_code'], 'name_hi'])
        if len(S) == 0 and len(D) != 0:
            for d in D:
                l = (possibledistricts[possibledistricts["name_hi"] == d]
                     ).index.tolist()  # index of all matched rows
                for ll in l:  # for each index print corresponding State
                    print("State Code: ", cdf.at[ll, 'state_code'], ", State: ",
                          state.at[cdf.at[ll, 'state_code'], 'name_hi'])
                    S.append(state.at[cdf.at[ll, 'state_code'], 'name_hi'])
        list_output.append(locations)
        print(S, D, SD, PT)
        # NOTE(review): returning inside the loop means only the FIRST entity
        # is fully processed — confirm whether this early return is intended.
        if flag_perfectmatch:
            list_output.append("Yes")
            return (0, S, D, SD, PT)
        else:
            list_output.append("No")
            return (alpha, S, D, SD, PT)
    return
def load_data(folderpath):
    """ Open CSV files in a folder and index the data.

    Each CSV holds exported Windows event logs; the header row is mapped to
    column indices, then every event row is persisted through the Django ORM
    (Host/User/SourceIp/Service/Task/Event_* models) via get_or_create so
    re-runs are idempotent.
    """
    files = os.listdir(folderpath)
    for f in files:
        fullname = folderpath + f
        if os.path.isfile(fullname):
            with open(fullname, 'rb') as csvfile:
                eventsreader = csv.reader(csvfile, delimiter=',', quotechar='"')
                for line, row in enumerate(eventsreader):
                    # Header row: discover the index of each column we use.
                    if line == 0:
                        for column in range(0, len(row)):
                            if row[column] == "TimeCreated":
                                TIME_IDX = column
                            elif row[column] == "Id":
                                ID_IDX = column
                            elif row[column] == "MachineName":
                                MACHINENAME_IDX = column
                            elif row[column] == "TargetUserName":
                                TARGETUSERNAME_IDX = column
                            elif row[column] == "SubjectUserName":
                                SUBJECTUSERNAME_IDX = column
                            elif row[column] == "LogonType":
                                LOGONTYPE_IDX = column
                            elif row[column] == "ProcessName":
                                PROCESSNAME_IDX = column
                            elif row[column] == "IpAddress":
                                IP_IDX = column
                            elif row[column] == "WorkstationName":
                                WORKSTATIONAME_IDX = column
                            elif row[column] == "ServiceName":
                                SERVICENAME_IDX = column
                            elif row[column] == "ImagePath":
                                IMAGEPATH_IDX = column
                            elif row[column] == "ServiceType":
                                SERVICETYPE_IDX = column
                            elif row[column] == "Status":
                                STATUS_IDX = column
                            elif row[column] == "SubStatus":
                                SUBSTATUS_IDX = column
                            elif row[column] == "TaskName":
                                TASKNAME_IDX = column
                            elif row[column] == "TaskContent":
                                TASKCONTENT_IDX = column
                            elif row[column] == "Message":
                                MESSAGGE_IDX = column
                            elif row[column] == "ShareName":
                                SHARENAME_IDX = column
                            elif row[column] == "PrivilegeList":
                                PRIVILEGE_IDX = column
                    print "reading ", f, " line:", line
                    # 4624 Authentication event
                    if row[ID_IDX] == "4624":
                        strtime = row[TIME_IDX]
                        time = getdate(strtime)
                        hostname = row[MACHINENAME_IDX]
                        username = row[TARGETUSERNAME_IDX].lower()
                        logontype = getlogontype(str(row[LOGONTYPE_IDX]))
                        processname = row[PROCESSNAME_IDX]
                        sourceip = row[IP_IDX]
                        sourcehost = row[WORKSTATIONAME_IDX]
                        host, created_host = Host.objects.get_or_create(
                            hostname=hostname)
                        user, created_user = User.objects.get_or_create(
                            username=username)
                        # Only record a SourceIp for remote logons with a
                        # usable IPv4 address ('-' and IPv6 are skipped).
                        if (logontype == "Network" or logontype ==
                                "RemoteInteractive") and (
                                    sourceip != "-" and ":" not in sourceip):
                            source, created_source = SourceIp.objects.get_or_create(
                                sourceip=sourceip, hostname=sourcehost)
                            event4624, createdevent = Event_4624.objects.get_or_create(
                                time=time, host=host, user=user,
                                logontype=logontype, processname=processname,
                                sourcehost=sourcehost, sourceip=source)
                        else:
                            try:
                                event4624, createdevent = Event_4624.objects.get_or_create(
                                    time=time, host=host, user=user,
                                    logontype=logontype,
                                    processname=processname,
                                    sourcehost=sourcehost)
                            except:
                                pass
                        if createdevent:
                            print "new 4624 created for host " + hostname
                    # 4625 Authentication event
                    if row[ID_IDX] == "4625":
                        strtime = row[TIME_IDX]
                        time = getdate(strtime)
                        hostname = row[MACHINENAME_IDX]
                        username = row[TARGETUSERNAME_IDX].lower()
                        logontype = getlogontype(str(row[LOGONTYPE_IDX]))
                        processname = row[PROCESSNAME_IDX]
                        sourceip = row[IP_IDX]
                        sourcehost = row[WORKSTATIONAME_IDX]
                        status = row[STATUS_IDX]
                        substatus = row[SUBSTATUS_IDX]
                        host, created_host = Host.objects.get_or_create(
                            hostname=hostname)
                        user, created_user = User.objects.get_or_create(
                            username=username)
                        # new: same remote-logon / IPv4 filter as 4624.
                        if (logontype == "Network" or logontype ==
                                "RemoteInteractive") and (
                                    sourceip != "-" and ":" not in sourceip):
                            source, created_source = SourceIp.objects.get_or_create(
                                sourceip=sourceip, hostname=sourcehost)
                            event4625, createdevent = Event_4625.objects.get_or_create(
                                time=time, host=host, user=user,
                                logontype=logontype, processname=processname,
                                sourcehost=sourcehost, sourceip=source,
                                status=status, substatus=substatus)
                        else:
                            event4625, createdevent = Event_4625.objects.get_or_create(
                                time=time, host=host, user=user,
                                logontype=logontype, processname=processname,
                                sourcehost=sourcehost, status=status,
                                substatus=substatus)
                        if createdevent:
                            print "new 4625 created for host " + hostname
                    # 4776 Local Authentication Event
                    if row[ID_IDX] == "4776":
                        strtime = row[TIME_IDX]
                        time = getdate(strtime)
                        hostname = row[MACHINENAME_IDX]
                        username = row[TARGETUSERNAME_IDX].lower()
                        status = row[STATUS_IDX]
                        host, created_host = Host.objects.get_or_create(
                            hostname=hostname)
                        user, created_user = User.objects.get_or_create(
                            username=username)
                        event4776, createdevent = Event_4776.objects.get_or_create(
                            time=time, host=host, user=user, status=status)
                        if createdevent:
                            print "new 4776 created for host " + hostname
                    # 5140 File Share Event
                    if row[ID_IDX] == "5140":
                        strtime = row[TIME_IDX]
                        time = getdate(strtime)
                        hostname = row[MACHINENAME_IDX]
                        username = row[SUBJECTUSERNAME_IDX].lower()
                        sharename = row[SHARENAME_IDX]
                        host, created_host = Host.objects.get_or_create(
                            hostname=hostname)
                        user, created_user = User.objects.get_or_create(
                            username=username)
                        event5140, createdevent = Event_5140.objects.get_or_create(
                            time=time, host=host, user=user,
                            sharename=sharename)
                        if createdevent:
                            print "new 5140 created for host " + hostname
                    # 4672 Privileged Auth
                    if row[ID_IDX] == "4672":
                        strtime = row[TIME_IDX]
                        time = getdate(strtime)
                        hostname = row[MACHINENAME_IDX]
                        username = row[SUBJECTUSERNAME_IDX].lower()
                        privilegelist = row[PRIVILEGE_IDX]
                        host, created_host = Host.objects.get_or_create(
                            hostname=hostname)
                        user, created_user = User.objects.get_or_create(
                            username=username)
                        event4672, createdevent = Event_4672.objects.get_or_create(
                            time=time, host=host, user=user,
                            privilegelist=privilegelist)
                        if createdevent:
                            print "new 4672 created for host " + hostname
                    # 7045 New Service created event
                    elif row[ID_IDX] == "7045":
                        strtime = row[TIME_IDX]
                        time = getdate(strtime)
                        hostname = row[MACHINENAME_IDX]
                        servicename = row[SERVICENAME_IDX]
                        imagepath = row[IMAGEPATH_IDX]
                        servicetype = row[SERVICETYPE_IDX]
                        service, service_created = Service.objects.get_or_create(
                            servicename=servicename, imagepath=imagepath,
                            servicetype=servicetype)
                        host, created_host = Host.objects.get_or_create(
                            hostname=hostname)
                        event7045, createdevent = Event_7045.objects.get_or_create(
                            time=time, host=host, service=service)
                        if createdevent:
                            print "new 7045 created for " + hostname
                    # 4698 New Scheduled Task event
                    elif row[ID_IDX] == "4698":
                        strtime = row[TIME_IDX]
                        time = getdate(strtime)
                        hostname = row[MACHINENAME_IDX]
                        taskname = row[TASKNAME_IDX]
                        taskdetails = row[TASKCONTENT_IDX]
                        #arguments = re.findall('<Arguments>(.*?)</Arguments>', taskdetails, re.DOTALL)[0]
                        #cmdline = re.findall('<Command>(.*?)</Command>', taskdetails, re.DOTALL)[0]
                        # Task XML may lack either element; fall back to "".
                        try:
                            arguments = find_between(taskdetails, "<Arguments>",
                                                     "</Arguments>")
                        except:
                            arguments = ""
                        try:
                            cmdline = find_between(taskdetails, "<Command>",
                                                   "</Command>")
                        except:
                            cmdline = ""
                        host, created_host = Host.objects.get_or_create(
                            hostname=hostname)
                        task, taskcreated = Task.objects.get_or_create(
                            taskname=taskname, imagepath=cmdline,
                            arguments=arguments)
                        event4698, createdevent = Event_4698.objects.get_or_create(
                            time=time, host=host, task=task)
                        if createdevent:
                            print "new 4698 created for " + hostname
                    # WMI event
                    elif row[ID_IDX] == "2":
                        strtime = row[TIME_IDX]
                        time = getdate(strtime)
                        hostname = row[MACHINENAME_IDX]
                        wmidetails = row[MESSAGGE_IDX]
                        operationid = re.findall('GroupOperationId = (.*?);',
                                                 wmidetails, re.DOTALL)[0]
                        operation = re.findall('Operation = (.*?);',
                                               wmidetails, re.DOTALL)[0]
                        host, created_host = Host.objects.get_or_create(
                            hostname=hostname)
                        WmiEvent_2.objects.create(time=time, host=host,
                                                  operationid=operationid,
                                                  operation=operation)
                        print "new Wmi Event 2 created for " + hostname
def parser(log, format="str"):
    """Parse one nginx access-log line into a params dict.

    Depending on *format*, returns a formatted string ("str"), the dict's
    values ("tuple"), or the dict itself ("json").
    """
    row = ""
    splitted = log.split()
    # The quoted request section, e.g. "GET /path HTTP/1.1" ... — split into
    # its tokens with the quotes removed.
    gr2 = group_by_quote.search(log).group().replace('"', '').split()
    # Request ID
    request_id = str(uuid.uuid4())
    # Visitor ID
    visitor_id = id_generator()
    # UserID
    user_id = get_next_user_id().next()
    process_name = "nginx"
    ip = splitted[0]
    request_type = gr2[0]
    # Get url with different way to deal with database.
    url = urlparse.urlparse(gr2[1]).path
    # 06/Nov/2015:06:47:45 +0000
    # str_dt = re.search(r'[0-9]{2}\/.*\/[0-9]{4}\:[0-9]{2}\:[0-9]{2}\:[0-9]{2}\s\+[\d]+', log).group(),
    # str_dt = datetime_parser.search(log).group(),
    str_dt = find_between(log, "[", "]").lstrip()
    dt_format = "%d/%b/%Y:%H:%M:%S +0000"
    dt = datetime.strptime(str_dt, dt_format)
    # Convert the UTC log timestamp to the configured local timezone.
    try:
        dt = dt.replace(tzinfo=tz.gettz('UTC')).astimezone(timezone)
    except:
        import traceback
        traceback.print_exc()
    # row += "%s|" % dt.strftime("%Y-%m-%d %H:%M:%S")
    status = int(gr2[3])
    # row += "%s|" % status
    params = create_param(urlparse.parse_qs(urlparse.urlparse(gr2[1]).query))
    redirection_url = gr2[5]
    # nginx logs "-" when there is no referrer/redirect target.
    if redirection_url == "-":
        redirection_url = ""
    # prepare data.
    params["request_id"] = request_id
    params["redirection_url"] = redirection_url
    params["user_id"] = user_id if user_id else 'NULL'
    params["visitor_id"] = visitor_id
    params["process_name"] = process_name
    params["ip"] = ip
    params["request_type"] = request_type
    params["url"] = url
    params["event_at"] = "%s" % dt.strftime("%Y-%m-%d %H:%M:%S")
    params["http_status"] = status
    params["created_at"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    row = ""
    if format == "str":
        row = get_row_format().format(**params)
    elif format == "tuple":
        row = params.values()
    elif format == "json":
        return params
    return row
def set_name(self):
    """Extract the company name from the quote HTML, dropping any bold
    markup, and store it on self.name."""
    raw = find_between( self.html,
                        'target="_blank" class="tab-link">',
                        '</a>' )
    self.name = raw.replace('<b>','').replace('</b>','').strip()
f for f in listdir(cfg.PATH_OUTPUT) if (isfile(join(cfg.PATH_OUTPUT, f)) & (f.endswith('.txt'))) ]: results = dict() title = '' f = '' fstd = '' mae = '' maestd = '' for line in open(cfg.PATH_OUTPUT + fName, 'r'): line = line.replace('\n', '') if ('Executing for' in line): #write away previous results results[title] = [mae, maestd, f, fstd] title = 'cross_' + utils.find_between(line, 'Executing for ', 'model').strip() elif ('Average Mean absolute error (official): ' in line): mae = line.split("(official): ", 1)[1] elif ('Average Mean absolute error (official); std: ' in line): maestd = line.split('std: ', 1)[1] elif ('Average F1-measure: ' in line): f = line.split("measure: ", 1)[1] elif ('Average F1-measure; std: ' in line): fstd = line.split("std: ", 1)[1] elif ("Scores for " in line): #write away previous, doing for test now results[title] = [mae, maestd, f, fstd] title = 'test_' + utils.find_between(line, 'Scores for ', ':').strip() results[title] = [mae, maestd, f, fstd]