Beispiel #1
0
def get_ticker_article_age( sym ):
    """Return the parsed date of the first zetakap article listed for ``sym``.

    The Seeking Alpha symbol page for ``sym`` (lower-cased) is fetched via
    ``get_pretty_html``.  The first list item that links to the ``zetakap``
    author is located; if there is none, ``gmtime(0)`` is returned.
    Otherwise the date text found in that item is joined with the current
    year and the result of parsedatetime's ``parseDateText`` is returned.
    """
    page = get_pretty_html( 'http://seekingalpha.com/symbol/' + sym.lower() )

    author_item = find_between( page, '<a href="/author/zetakap">', '</li>' )
    if author_item == '':
        # No entry by this author on the page.
        return gmtime(0)

    raw_date = find_between( author_item, '</span>', '<span class="bullet">' ).strip()

    calendar = pdt.Calendar( pdc.Constants() )

    # Drop the first "on" from the text, then append the current year so the
    # parser receives a complete date.
    pub_date = raw_date.replace('on', '', 1).strip()
    this_year = calendar.parse("this year")[0].tm_year

    return calendar.parseDateText( " ".join( [ pub_date, str( this_year ) ] ) )
Beispiel #2
0
 def set_sector(self):
     """Store the sector link text found in ``self.html`` as ``self.sector``.

     The sector key is read from the screener link's ``f=sec_`` query value
     first; the complete opening tag is then rebuilt from it so the text up
     to the closing ``</a>`` can be extracted and stripped.
     """
     link_prefix = '<a href="screener.ashx?v=111&amp;f=sec_'
     link_suffix = '" class="tab-link">'
     sector_key = find_between( self.html, link_prefix, link_suffix )
     opening_tag = link_prefix + sector_key + link_suffix
     self.sector = find_between( self.html, opening_tag, '</a>' ).strip()
Beispiel #3
0
 def set_industry(self):
     """Store the industry link text found in ``self.html`` as ``self.industry``.

     The industry key is read from the screener link's ``f=ind_`` query
     value first; the complete opening tag is then rebuilt from it so the
     text up to the closing ``</a>`` can be extracted and stripped.
     """
     link_prefix = '<a href="screener.ashx?v=111&amp;f=ind_'
     link_suffix = '" class="tab-link">'
     industry_key = find_between( self.html, link_prefix, link_suffix )
     opening_tag = link_prefix + industry_key + link_suffix
     self.industry = find_between( self.html, opening_tag, '</a>' ).strip()
Beispiel #4
0
def get_comic_info(comic_id):
    """Download every chapter of a comic into a directory tree.

    Fetches the comic's index page, prints its title and introduction,
    creates (or reuses) a directory named after the title and, inside it,
    one directory per chapter list found on the page.  ``get_pic`` is called
    for every chapter entry returned by ``get_chapter_info`` while the
    process working directory is the chapter list's directory.

    The working directory in effect when the function was called is always
    restored on exit, including when a request or download raises.

    Args:
        comic_id: identifier interpolated into the comic URL.

    Raises:
        OSError: if a directory cannot be created for a reason other than
            already existing, or cannot be entered.
    """
    r = requests.get('http://www.manhuagui.com/comic/%s/' % comic_id,
                     headers=headers)

    title = find_between(r.text, '<div class="book-title">', '</div>')
    title = re.sub('<[^>]*>', '', title).strip()
    intro = find_between(r.text, '<div id="intro-all" class="none">', '</div>')
    intro = re.sub('<[^>]*>', '', intro).strip()

    print(title)
    print(intro)

    chapter_list_marker = (
        r'''<div class="chapter-list cf mt10" id='chapter-list-0'>''')

    start_dir = os.getcwd()
    try:
        os.mkdir(title)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
    os.chdir(title)
    try:
        comic_dir = os.getcwd()

        # The search deliberately starts at offset 1, as the original did.
        index = r.text.find(chapter_list_marker, 1)
        while index != -1:
            # The heading of a chapter list is the last <span> before it.
            chapter_title = find_between_r(r.text,
                                           '<span>',
                                           '</span>',
                                           end=index)
            chapter_text = find_between(r.text, chapter_list_marker,
                                        '</div>', index)
            print(chapter_title)
            chapters = get_chapter_info(chapter_text)

            try:
                os.mkdir(chapter_title)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
            os.chdir(chapter_title)
            try:
                for chapter in chapters:
                    get_pic('http://www.manhuagui.com/%s' % chapter[0])
            finally:
                # Return to the comic directory even if a download failed.
                os.chdir(comic_dir)

            index = r.text.find(chapter_list_marker, index + 1)
    finally:
        os.chdir(start_dir)
Beispiel #5
0
def getFoods(campusnum, mealnum):
    """Scrape the foods offered for one meal, grouped by menu category.

    Args:
        campusnum: campus identifier, passed through to ``getHTML``.
        mealnum: meal identifier, passed to ``getHTML`` and used to look up
            the meal's display name in ``mealNames``.

    Returns:
        ``'error'`` when ``getHTML`` returns ``'error'``; the string
        ``'No foods available for this meal today'`` when no category was
        collected; otherwise a dict mapping each category name to a dict of
        ``{item name: getNutritionInfo(item id)}``.
    """
    soup = getHTML(campusnum, mealnum)
    if soup == 'error':
        return 'error'
    log.status('Scraping foods for meal: {}'.format(mealNames[mealnum]))

    category_tags = soup.find_all('p', attrs={'style': 'margin: 3px 0;'})
    categories = [tag.b.string.replace('--', '').strip()
                  for tag in category_tags]

    # ``soup`` is not modified below, so serialise it once.
    page = str(soup)
    mealJson = {}

    # A category's markup is whatever lies between its heading and the next
    # heading.  NOTE(review): the final heading has no successor, so its
    # items are never collected -- behaviour kept from the original; confirm
    # whether the page always ends with a sentinel heading.
    for name, start, stop in zip(categories, category_tags,
                                 category_tags[1:]):
        section = BeautifulSoup(find_between(page, str(start), str(stop)),
                                'lxml')
        foods = {}
        mealJson[name] = foods
        # ``attrs`` must be a dict; the previous set literal only worked by
        # accident and also matched elements whose class was "class".
        for item in section.find_all('div', attrs={'class': 'col-1'}):
            item_id = item.input['value'].replace('*', '%2A')
            # Detach the <input> before reading ``item.string``
            # (presumably so that only the label text remains).
            item.find('input').extract()
            food_name = item.string
            info = getNutritionInfo(item_id)
            foods[food_name] = info
            if info == 'No Information Available for this Item':
                log.warn('No information available for item: {}'.format(
                    food_name))

    if not mealJson:
        return 'No foods available for this meal today'

    return mealJson
Beispiel #6
0
def getibiblio(word):
    """Return the first Webster definition of ``word`` from ibiblio.org.

    The word is URL-encoded (spaces become ``+``, other reserved or
    non-ASCII characters are percent-escaped) before being sent as the
    ``query`` parameter.  The text between the first ``<def>`` and
    ``</def>`` is taken, runs of spaces are collapsed, a trailing ``.`` is
    appended and the result is passed through ``utils.strip_html_tags``.

    Args:
        word: headword to look up.

    Returns:
        Whatever ``utils.strip_html_tags`` returns for the cleaned
        definition text.
    """
    from urllib.parse import quote_plus

    url = ('http://www.ibiblio.org/webster/cgi-bin/headword_search.pl?query='
           + quote_plus(word))
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as response:
        html = response.read().decode()

    definition = utils.find_between(html, '<def>', '</def>').strip()
    s = re.sub(' +', ' ', definition) + '.'
    return utils.strip_html_tags(s)
Beispiel #7
0
 def set_beta(self):
     """Extract the value following ``body=[Beta]`` into ``self.beta``.

     When the extracted fragment is empty or a single character it is stored
     unchanged; otherwise the text after the first ``<b>`` is stripped of
     tags and surrounding whitespace before being stored.
     """
     fragment = find_between( self.html, 'body=[Beta]', '</b>' )
     if fragment != '' and len(fragment) != 1:
         bold_text = fragment.split('<b>')[1]
         self.beta = re.sub(r'<[^>]*?>', '', bold_text).strip()
     else:
         self.beta = fragment
Beispiel #8
0
 def wrapper(*args, **kwargs):
     # Take the user name from the request path (the segment after /user/)
     # and only call the wrapped function when it matches the current user.
     path_username = find_between(request.path, '/user/', '/')[0]
     log('take username: <{}>'.format(path_username))
     user = current_user()
     if user.username != path_username:
         abort(502)
     else:
         return f(*args, **kwargs)
Beispiel #9
0
def recv(fp):
    """Process a received line relayed by an external Red Eclipse source.

    Nothing happens unless ``fp.external()`` and ``isredeclipse(fp)`` are
    both true.  In that case the function:

    * clears ``<sendernick>.authname.*`` entries in ``fp.server.state`` on
      QUIT/PART,
    * extracts the relayed user name (text between ``<`` and ``> ``) into
      ``fp.user`` and sets access strings for it,
    * looks the user up in the RedFlare server report to find an auth name,
    * and, if the text after ``> `` starts with the bot's nick or one of the
      channel prefixes, replies with ``commands.doptext(fp, ptext)``.
    """
    if fp.external() and isredeclipse(fp):
        redflare = fp.server.import_module('redflare.redflare', False)
        commands = fp.server.import_module('commands', False)
        if fp.sp.iscode('QUIT') or fp.sp.iscode('PART'):
            # Blank every state entry whose key starts with
            # "<sendernick>.authname.".
            for k in fp.server.state:
                if k.find('%s.authname.' % fp.sp.sendernick) == 0:
                    fp.server.state[k] = ''
        ptext = ""
        # Remove \x0f characters from the text before parsing it.
        fp.sp.text = fp.sp.text.replace('\x0f', '')
        try:
            fp.user = utils.find_between(fp.sp.text, '<', '> ')
        except IndexError:
            fp.user = ''
        fp.setaccess("%s==" % fp.user)

        authname = ""
        rf = redflare.RedFlare('http://redflare.ofthings.net/reports')
        # The db entry is "host:port"; the host part may itself contain ':'.
        entry = fp.server.db['reservers'][fp.sp.sendernick].split(':')
        host = ':'.join(entry[0:-1])
        if host == '%':
            # '%' means: use the host part of the sender's own hostmask.
            host = fp.sp.host.split('@')[1]
        # gethostbyaddr returns (hostname, aliases, addresses); take the
        # first address.
        host = socket.gethostbyaddr(host)[2][0]
        if not host:
            host = ''
        # Find the matching server in the report and pick up the auth name
        # listed for this player, if any.
        for server in rf.servers:
            if server['host'] == host and server['port'] == int(entry[-1]):
                for player in server['playerauths']:
                    if player[0] == fp.user:
                        authname = player[1]
        fp.setaccess("%s==re:%s" % (fp.user, authname))

        try:
            text = fp.sp.text[fp.sp.text.index('> ') + 2:]
        except ValueError:
            # No "> " separator: nothing to interpret as a command.
            return
        prefixt = fp.channel.entry['prefix'].split()
        possible = [
            fp.server.nick + ', ',
            fp.server.nick + ': ',
            ] + prefixt
        found = False
        prefix = prefixt[0]
        # The first candidate the text starts with is the prefix.
        for p in possible:
            if text.find(p) == 0:
                found = True
                prefix = p
                break
        if not found:
            return
        ptext = text[len(prefix):]
        # Ignore text that begins with punctuation right after the prefix.
        if len(ptext.lstrip(string.punctuation)) < len(ptext):
            return
        if ptext:
            fp.reply(commands.doptext(fp, ptext))
Beispiel #10
0
 def wrapper(*args, **kwargs):
     # Take the topic id from the request path (the segment after /topic/)
     # and only call the wrapped function when the current user owns it.
     topic_id = find_between(request.path, '/topic/', '/')[0]
     log('take tid: <{}>'.format(topic_id))
     user = current_user()
     topic = Topic.one(id=topic_id)
     if topic.user_id != user.id:
         abort(502)
     else:
         return f(*args, **kwargs)
Beispiel #11
0
def get_pic(url):
    """Download all page images of the chapter at ``url``.

    The chapter page is fetched, the packed script it embeds is evaluated
    with ``execjs`` and the JSON that follows ``cInfo=`` in the result is
    read as the chapter description.  A directory named ``<cname>-<len>p``
    is created (or reused) and one thread per file runs ``dlfile`` to fetch
    the image; the function returns after all threads have finished.

    Args:
        url: address of the chapter page; also sent as the ``referer``
            header of every image request.

    Raises:
        OSError: if the target directory cannot be created for a reason
            other than already existing.
    """
    s = requests.Session()

    r = s.get(url, headers=headers)

    # NOTE(review): this evaluates JavaScript taken from the remote page.
    js = find_between(r.text, r'["\x65\x76\x61\x6c"]', '</script>')
    info = execjs.compile(LZjs).eval(js)
    info = find_between(info, 'cInfo=', '||{};')
    info = json.loads(info)

    print(info)

    name = info['cname']
    path = info['path']
    pages = info['len']

    dir_name = '%s-%sp' % (name, pages)
    try:
        os.mkdir(dir_name)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    # Work on a copy: assigning into the shared module-level ``headers``
    # would leak this chapter's referer into every later request.
    _headers = headers.copy()
    _headers['referer'] = url

    args_list = []
    for i, filename in enumerate(info['files'], 1):
        if filename.endswith('.webp'):
            # Drop the ".webp" suffix to get the underlying file name.
            filename = filename[:-5]

        pic_url = 'http://{}{}{}'.format(servers[0], path, filename)
        print(pic_url)

        ext = os.path.splitext(filename)[1]
        # Images are saved under their 1-based position in the chapter.
        args_list.append((s, pic_url, _headers, dir_name, '%s%s' % (i, ext)))

    threads = [threading.Thread(target=dlfile, args=a) for a in args_list]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
Beispiel #12
0
 def __init__(self, screen ):
     """Collect tickers for ``screen`` into ``self.tickers``.

     The page for ``make_finviz_url( screen )`` is scanned for
     ``quote.ashx?t=`` links one at a time.  Tickers that are pending or
     excluded are skipped; the rest are kept when ``is_old_enough`` accepts
     the result of ``get_ticker_article_age``.  Scanning stops when no link
     is left or once the number of kept tickers exceeds the limit.
     """
     self.screen = screen
     pending_tickers = self.get_pending_tickers()
     pretty_html = get_pretty_html( make_finviz_url( screen ) )
     self.tickers = []
     kept = 0
     limit = random.randrange(6, 7)
     link_start = '<a href="quote.ashx?t='
     while True:
         ticker = find_between( pretty_html, link_start, '&amp;' )
         if not ticker:
             break
         if kept > limit:
             break
         # Consume this link so the next search finds the following one.
         pretty_html = pretty_html.replace(link_start, '', 1)
         if ticker in pending_tickers or ticker in excluded_tickers:
             continue
         print("testing ticker " + ticker)
         # only take tickers older than 25 days
         if is_old_enough( get_ticker_article_age( ticker ) ):
             self.tickers.append( ticker )
             kept += 1
Beispiel #13
0
 def set_short(self):
     """Extract the short-interest value into ``self.short_interest``.

     An empty or single-character fragment is stored unchanged.  Otherwise
     the text after the first ``<b>`` is stripped of tags; a value of ``-``
     or one above 12 (percent sign ignored) is stored as ``'100%'``, any
     other value is stored as found.
     """
     fragment = find_between( self.html, 'body=[Short interest share]', '</b>' )
     if fragment == '' or len(fragment) == 1:
         self.short_interest = fragment
         return
     short = re.sub(r'<[^>]*?>', '', fragment.split('<b>')[1]).strip()
     number = short.replace('%','')
     out_of_range = number == '-' or float(number) > 12
     self.short_interest = '100%' if out_of_range else short
Beispiel #14
0
 def set_cap(self):
     """Extract the market-capitalization value into ``self.cap``.

     An empty or single-character fragment is stored unchanged.  Otherwise
     the text after the first ``<b>`` is stripped of tags.  A value that
     contains ``M`` and whose number (with ``M`` removed) truncates to less
     than 100 is stored as ``'0'``; any other value is stored as found.

     Raises:
         ValueError: if a value containing ``M`` is not numeric once the
             ``M`` is removed.
     """
     cap_html = find_between( self.html, 'body=[Market capitalization]', '</b>' )
     if cap_html == '' or len(cap_html) == 1:
         self.cap = cap_html
         return
     cap = re.sub(r'<[^>]*?>', '', cap_html.split('<b>')[1]).strip()
     # ``int`` replaces the Python-2-only ``long``; the comparison result is
     # the same on Python 2 and the code now also runs on Python 3.
     if 'M' in cap and int(float(cap.replace('M',''))) < 100:
         self.cap = '0'
         return
     self.cap = cap
Beispiel #15
0
 def set_screen_pairs(self):
     """Append ``[abbreviation, value]`` pairs for the screens to ``self.screen_pairs``.

     A temporary ``'PO'`` entry is inserted right after the first
     ``'D'``/``'DH'``/``'DVH'`` entry of ``self.screen``.  For every entry
     that is neither empty nor in ``ignored``, the value following
     ``body=[<html_maps[entry]>]`` in ``self.html`` is extracted and stored
     with the entry's abbreviation.  Finally the first ``'PO'`` entry is
     removed from ``self.screen`` again.
     """
     for position, code in enumerate(self.screen):
         if code in ('D', 'DH', 'DVH'):
             # Safe to mutate here: the loop ends immediately afterwards.
             self.screen.insert( position + 1, 'PO' )
             break

     for code in self.screen:
         if code == '' or code in ignored:
             continue
         fragment = find_between( self.html, "body=[%s]" % html_maps[code], '</b>' )
         if fragment == '' or len(fragment) == 1:
             continue
         bold_text = fragment.split('<b>')[1]
         self.screen_pairs.append(
             [ abbrevs[code], re.sub(r'<[^>]*?>', '', bold_text).strip() ] )

     if 'PO' in self.screen:
         self.screen.remove('PO')
Beispiel #16
0
 def set_desc(self):
     """Store the stripped contents of the ``fullview-profile`` cell as ``self.desc``."""
     cell_open = '<td class="fullview-profile" align="left">'
     self.desc = find_between( self.html, cell_open, '</td>' ).strip()
Beispiel #17
0
def get_title(url, timeout=1):
    """Fetch ``url`` and return the text between ``<title>`` and ``</title>``.

    ``timeout`` is passed to ``requests.get``; the response body is decoded
    with ``bytes.decode()``'s default encoding before it is searched.
    """
    response = requests.get(url, timeout=timeout)
    page = response.content.decode()
    return utils.find_between(page, '<title>', '</title>')
import utils
# Directory prefix prepended to every file name listed in css.txt.
css_base = 'downloads/'

# css.txt lists one stylesheet file name per line.  For each stylesheet,
# print its path and then the target of the first url(...) on every line
# that contains one, with quotes removed.  Both files are opened with
# ``with`` so they are closed even if reading fails.
with open("css.txt", 'r') as css_list:
    for css_file in css_list:
        css_file = css_base + css_file.strip()
        print("Base: " + css_file)
        with open(css_file, 'r') as stylesheet:
            for line in stylesheet:
                if ('url(' in line) or ('url (' in line):
                    print(utils.find_between(line.strip(), "url(", ")").replace("\"", "").replace("'", ""))
def getEntityLocation(location_string):
    """Match a Hindi location string against the census hierarchy.

    Reads ``census_hindi_sd.csv``, splits it into state, district,
    sub-district and panchayat/town tables, and (re)writes the header row
    of ``output.csv``.  The comma-separated entities of ``location_string``
    are de-duplicated, then each entity is checked for exact
    (space-delimited) matches of state, district and sub-district names;
    missing levels are filled in from matched lower levels and, failing
    that, by the candidate with the smallest Hamming distance below 5.

    NOTE(review): both ``return`` statements at the end of the loop body
    run on the first iteration, so only the first entity is ever processed.

    Args:
        location_string: comma-separated entities, or ``'n'`` when no
            location is available.

    Returns:
        ``(-1, S, D, SD, PT)`` when ``location_string == 'n'``;
        ``(0, S, D, SD, PT)`` when any exact match was found; otherwise
        ``(alpha, S, D, SD, PT)`` where ``alpha`` is the Hamming distance
        of the accepted approximate match (5 if none was accepted).  ``S``,
        ``D`` and ``SD`` are lists of matched Hindi names; ``PT`` is always
        the empty string.
    """
    census_hi_file = "census_hindi_sd.csv"
    cdf = pandas.read_csv(census_hi_file)
    cdf['name_hi'] = cdf['name_hi'].str.strip()

    ############ SEGREGATING ALL THE STATE, DISTRICTS, SUB-DISTRICTs, PANCHAYATS/TOWNS, M. CORP. INTO DIFFERENT DATAFRAMES################

    # A row belongs to a level when that level's code is non-zero and every
    # finer-grained code is zero.
    states = cdf[['state_code', 'name_en', 'name_hi'
                  ]][(cdf.district_code == 0) & (cdf.subdistrict_code == 0) &
                     (cdf.panchayat_town_code == 0) & (cdf.state_code != 0)]
    state = states.set_index('state_code')
    districts = cdf[[
        'district_code', 'name_en', 'name_hi'
    ]][(cdf.subdistrict_code == 0) & (cdf.panchayat_town_code == 0) &
       (cdf.state_code != 0) & (cdf.district_code != 0)]
    district = districts.set_index('district_code')
    sub_districts = cdf[[
        'subdistrict_code', 'name_en', 'name_hi'
    ]][(cdf.panchayat_town_code == 0) & (cdf.state_code != 0) &
       (cdf.district_code != 0) & (cdf.subdistrict_code != 0)]
    sub_district = sub_districts.set_index('subdistrict_code')
    panchayats_towns = cdf[[
        'panchayat_town_code', 'name_en', 'name_hi'
    ]][(cdf.state_code != 0) & (cdf.district_code != 0) &
       (cdf.subdistrict_code != 0) & (cdf.panchayat_town_code != 0)]

    ################################################# OUTPUT CSV FILES ##################################################################
    output_file = "output.csv"
    output_file_fields = [
        'File', 'locations', 'State', 'District', 'Subdistrict',
        'Panchayat/town', 'Perfect Match', 'location : hamming d/s',
        'time(in sec)'
    ]

    # Write the header row; mode 'w' truncates the file on every call.
    with open(output_file, 'w', newline='') as csvfile:
        # creating a csv writer object
        csvwriter = csv.writer(csvfile)
        # writing the fields
        csvwriter.writerow(output_file_fields)

    set_strings = set()  #unique strings

    ################################################# MAIN LOOP STARTS ##################################################################
    flag_perfectmatch = False  #flag to track the perfect match or not
    S = []
    D = []
    SD = []
    PT = ""
    Loc_hd = ""
    list_output = []
    locations = location_string

    if locations == 'n':  #location contains NaN and can't be processed
        print("can't understand")
        return (-1, S, D, SD, PT)
    # dict.fromkeys removes duplicate entities while keeping their order.
    location_entity = list(dict.fromkeys(
        locations.split(',')))  #separating the entities
    print("Entities : ", location_entity)
    for location in location_entity:
        #### for a single entity in a loop
        #### State Direct Match Code
        # A name only matches when it is surrounded by spaces in the entity.
        for loc in list(states["name_hi"]):
            if " " + loc + " " in location:
                S.append(loc)
                location = location.replace(loc, '')
                alphastate = 0
                flag_perfectmatch = True
                break
        #### District Direct Match Code
        if len(S) != 0:
            # Restrict district candidates to the matched state; with
            # several states only the last one's districts are kept.
            for s in S:
                indexnum = cdf[cdf['name_hi'] == s].index.values.astype(int)[0]
                statenum = cdf["state_code"][indexnum]
                inState = find_between(cdf, 'state_code', statenum,
                                       statenum + 1)
                # NOTE(review): the boolean mask is built from cdf but
                # applied to inState -- confirm the indexes line up.
                districtsinState = inState[[
                    'district_code', 'name_en', 'name_hi'
                ]][(cdf.subdistrict_code == 0) & (cdf.panchayat_town_code == 0)
                   & (cdf.state_code != 0) & (cdf.district_code != 0)]
            possibledistricts = districtsinState
            # print(possibledistricts)
        else:
            possibledistricts = districts

        for loc in list(possibledistricts["name_hi"]):
            if " " + loc + " " in location:
                D.append(loc)
                location = location.replace(loc, '')
                alphadistrict = 0
                flag_perfectmatch = True
                break
        #### Subdistrict Direct Match Code
        # Candidates come from the matched district if there is one, else
        # from the matched state, else from every sub-district.
        if len(D) != 0:
            for d in D:
                indexnum = cdf[cdf['name_hi'] == d].index.values.astype(int)[0]
                districtnum = cdf["district_code"][indexnum]
                inDistrict = find_between(cdf, 'district_code', districtnum,
                                          districtnum + 1)
                subdistrictsinstate = inDistrict[[
                    'subdistrict_code', 'name_en', 'name_hi'
                ]][(cdf.panchayat_town_code == 0) & (cdf.state_code != 0) &
                   (cdf.district_code != 0) & (cdf.subdistrict_code != 0)]
            possiblesubdistricts = subdistrictsinstate
        elif len(S) != 0:
            for s in S:
                indexnum = cdf[cdf['name_hi'] == s].index.values.astype(int)[0]
                statenum = cdf["state_code"][indexnum]
                inState = find_between(cdf, 'state_code', statenum,
                                       statenum + 1)
                subdistrictsinstate = inState[[
                    'subdistrict_code', 'name_en', 'name_hi'
                ]][(cdf.panchayat_town_code == 0) & (cdf.state_code != 0) &
                   (cdf.district_code != 0) & (cdf.subdistrict_code != 0)]
            possiblesubdistricts = subdistrictsinstate
        else:
            possiblesubdistricts = sub_districts
        for loc in list(possiblesubdistricts["name_hi"]):
            if " " + loc + " " in location:
                SD.append(loc)
                location = location.replace(loc, '')
                alphasubdistrict = 0
                flag_perfectmatch = True
                break
        #### Backpropagate States, Districts
        # Derive the district from a matched sub-district when no district
        # was matched directly.
        if len(D) == 0 and len(SD) != 0:
            for sd in SD:
                l = (
                    possiblesubdistricts[possiblesubdistricts["name_hi"] == sd]
                ).index.tolist()  #Index of all matched rows
                for ll in l:  #for each index print corresponding District,State
                    print("District Code: ", cdf.at[ll, 'district_code'],
                          ", District: ",
                          district.at[cdf.at[ll, 'district_code'], 'name_hi'])
                    D.append(district.at[cdf.at[ll, 'district_code'],
                                         'name_hi'])

        # Derive the state from a matched district when no state was
        # matched directly.
        if len(S) == 0 and len(D) != 0:
            for d in D:
                l = (possibledistricts[possibledistricts["name_hi"] == d]
                     ).index.tolist()  #Index of all matched rows
                for ll in l:  #for each index print corresponding State
                    print("State Code: ", cdf.at[ll, 'state_code'],
                          ", State: ", state.at[cdf.at[ll, 'state_code'],
                                                'name_hi'])
                    S.append(state.at[cdf.at[ll, 'state_code'], 'name_hi'])

        #### Approximate Matching
        # Compare every word n-gram of the entity (n = number of words in
        # the candidate name) with the candidate; keep the closest one whose
        # Hamming distance is below 5.
        if len(S) == 0:
            min_d_state = 5
            min_s_state = ""
            for loc in list(states["name_hi"]):
                lenloc = len(loc.split())
                tokenised_instance = location.split()
                ngrams = list(
                    zip(*[tokenised_instance[i:] for i in range(lenloc)]))
                ngrams = [' '.join(ngram) for ngram in ngrams]
                for ng in ngrams:
                    d = textdistance.hamming(ng,
                                             loc)  #Hamming textdistance algo
                    if (d < min_d_state):
                        min_s_state = loc
                        min_d_state = d
        if len(D) == 0:
            min_d_district = 5
            min_s_district = ""
            for loc in list(possibledistricts["name_hi"]):
                lenloc = len(loc.split())
                tokenised_instance = location.split()
                ngrams = list(
                    zip(*[tokenised_instance[i:] for i in range(lenloc)]))
                ngrams = [' '.join(ngram) for ngram in ngrams]
                for ng in ngrams:
                    d = textdistance.hamming(ng,
                                             loc)  #Hamming textdistance algo
                    if (d < min_d_district):
                        min_s_district = loc
                        min_d_district = d
        # alpha records the distance of the accepted approximate match;
        # it stays 5 when none is accepted.
        alpha = 5
        if len(S) == 0 and len(D) == 0:
            # Neither level matched: accept whichever candidate is closer,
            # preferring the state on a tie.
            if min_s_state != "" and min_s_district != "":
                if min_d_district < min_d_state:
                    D.append(min_s_district)
                    alpha = min_d_district
                else:
                    S.append(min_s_state)
                    alpha = min_d_state
            elif min_s_district != "":
                D.append(min_s_district)
                alpha = min_d_district
            elif min_s_state != "":
                S.append(min_s_state)
                alpha = min_d_state
        elif len(D) == 0:
            if min_s_district != "":
                D.append(min_s_district)
                alpha = min_d_district

        #### Backpropagate States, Districts
        # Repeated so that levels filled by approximate matching propagate
        # upwards as well.
        if len(D) == 0 and len(SD) != 0:
            for sd in SD:
                l = (
                    possiblesubdistricts[possiblesubdistricts["name_hi"] == sd]
                ).index.tolist()  #Index of all matched rows
                for ll in l:  #for each index print corresponding District,State
                    print("District Code: ", cdf.at[ll, 'district_code'],
                          ", District: ",
                          district.at[cdf.at[ll, 'district_code'], 'name_hi'])
                    D.append(district.at[cdf.at[ll, 'district_code'],
                                         'name_hi'])

        if len(S) == 0 and len(D) != 0:
            for d in D:
                l = (possibledistricts[possibledistricts["name_hi"] == d]
                     ).index.tolist()  #Index of all matched rows
                for ll in l:  #for each index print corresponding State
                    print("State Code: ", cdf.at[ll, 'state_code'],
                          ", State: ", state.at[cdf.at[ll, 'state_code'],
                                                'name_hi'])
                    S.append(state.at[cdf.at[ll, 'state_code'], 'name_hi'])

        list_output.append(locations)
        print(S, D, SD, PT)
        # Both branches return, so the loop never reaches a second entity.
        if flag_perfectmatch:
            list_output.append("Yes")
            return (0, S, D, SD, PT)
        else:
            list_output.append("No")
            return (alpha, S, D, SD, PT)
    return
Beispiel #20
0
def load_data(folderpath):
    """
    Open CSV files in a folder and index the data.
    """
    files = os.listdir(folderpath)

    for f in files:

        fullname = folderpath + f
        if os.path.isfile(fullname):
            with open(fullname, 'rb') as csvfile:

                eventsreader = csv.reader(csvfile,
                                          delimiter=',',
                                          quotechar='"')
                for line, row in enumerate(eventsreader):
                    if line == 0:
                        for column in range(0, len(row)):
                            if row[column] == "TimeCreated":
                                TIME_IDX = column
                            elif row[column] == "Id":
                                ID_IDX = column
                            elif row[column] == "MachineName":
                                MACHINENAME_IDX = column
                            elif row[column] == "TargetUserName":
                                TARGETUSERNAME_IDX = column
                            elif row[column] == "SubjectUserName":
                                SUBJECTUSERNAME_IDX = column
                            elif row[column] == "LogonType":
                                LOGONTYPE_IDX = column
                            elif row[column] == "ProcessName":
                                PROCESSNAME_IDX = column
                            elif row[column] == "IpAddress":
                                IP_IDX = column
                            elif row[column] == "WorkstationName":
                                WORKSTATIONAME_IDX = column
                            elif row[column] == "ServiceName":
                                SERVICENAME_IDX = column
                            elif row[column] == "ImagePath":
                                IMAGEPATH_IDX = column
                            elif row[column] == "ServiceType":
                                SERVICETYPE_IDX = column
                            elif row[column] == "Status":
                                STATUS_IDX = column
                            elif row[column] == "SubStatus":
                                SUBSTATUS_IDX = column
                            elif row[column] == "TaskName":
                                TASKNAME_IDX = column
                            elif row[column] == "TaskContent":
                                TASKCONTENT_IDX = column
                            elif row[column] == "Message":
                                MESSAGGE_IDX = column
                            elif row[column] == "ShareName":
                                SHARENAME_IDX = column
                            elif row[column] == "PrivilegeList":
                                PRIVILEGE_IDX = column

                    print "reading ", f, " line:", line
                    #4624 Authentication event
                    if row[ID_IDX] == "4624":
                        strtime = row[TIME_IDX]
                        time = getdate(strtime)

                        hostname = row[MACHINENAME_IDX]
                        username = row[TARGETUSERNAME_IDX].lower()
                        logontype = getlogontype(str(row[LOGONTYPE_IDX]))
                        processname = row[PROCESSNAME_IDX]
                        sourceip = row[IP_IDX]
                        sourcehost = row[WORKSTATIONAME_IDX]

                        host, created_host = Host.objects.get_or_create(
                            hostname=hostname)
                        user, created_user = User.objects.get_or_create(
                            username=username)

                        if (logontype == "Network"
                                or logontype == "RemoteInteractive") and (
                                    sourceip != "-" and ":" not in sourceip):
                            source, created_source = SourceIp.objects.get_or_create(
                                sourceip=sourceip, hostname=sourcehost)
                            event4624, createdevent = Event_4624.objects.get_or_create(
                                time=time,
                                host=host,
                                user=user,
                                logontype=logontype,
                                processname=processname,
                                sourcehost=sourcehost,
                                sourceip=source)
                        else:
                            try:
                                event4624, createdevent = Event_4624.objects.get_or_create(
                                    time=time,
                                    host=host,
                                    user=user,
                                    logontype=logontype,
                                    processname=processname,
                                    sourcehost=sourcehost)
                            except:
                                pass
                        if createdevent:
                            print "new 4624 created for host " + hostname

                    # 4625 Authentication event
                    if row[ID_IDX] == "4625":
                        strtime = row[TIME_IDX]
                        time = getdate(strtime)
                        hostname = row[MACHINENAME_IDX]
                        username = row[TARGETUSERNAME_IDX].lower()
                        logontype = getlogontype(str(row[LOGONTYPE_IDX]))
                        processname = row[PROCESSNAME_IDX]
                        sourceip = row[IP_IDX]
                        sourcehost = row[WORKSTATIONAME_IDX]
                        status = row[STATUS_IDX]
                        substatus = row[SUBSTATUS_IDX]

                        host, created_host = Host.objects.get_or_create(
                            hostname=hostname)
                        user, created_user = User.objects.get_or_create(
                            username=username)

                        #new
                        if (logontype == "Network"
                                or logontype == "RemoteInteractive") and (
                                    sourceip != "-" and ":" not in sourceip):
                            source, created_source = SourceIp.objects.get_or_create(
                                sourceip=sourceip, hostname=sourcehost)
                            event4625, createdevent = Event_4625.objects.get_or_create(
                                time=time,
                                host=host,
                                user=user,
                                logontype=logontype,
                                processname=processname,
                                sourcehost=sourcehost,
                                sourceip=source,
                                status=status,
                                substatus=substatus)
                        else:
                            event4625, createdevent = Event_4625.objects.get_or_create(
                                time=time,
                                host=host,
                                user=user,
                                logontype=logontype,
                                processname=processname,
                                sourcehost=sourcehost,
                                status=status,
                                substatus=substatus)

                        if createdevent:
                            print "new 4625 created for host " + hostname

                    # 4776 Local Authentication Event
                    if row[ID_IDX] == "4776":
                        strtime = row[TIME_IDX]
                        time = getdate(strtime)

                        hostname = row[MACHINENAME_IDX]
                        username = row[TARGETUSERNAME_IDX].lower()
                        status = row[STATUS_IDX]

                        host, created_host = Host.objects.get_or_create(
                            hostname=hostname)
                        user, created_user = User.objects.get_or_create(
                            username=username)

                        event4776, createdevent = Event_4776.objects.get_or_create(
                            time=time, host=host, user=user, status=status)
                        if createdevent:
                            print "new 4776 created for host " + hostname

                    # 5140 File Share Event
                    if row[ID_IDX] == "5140":
                        strtime = row[TIME_IDX]
                        time = getdate(strtime)

                        hostname = row[MACHINENAME_IDX]
                        username = row[SUBJECTUSERNAME_IDX].lower()
                        sharename = row[SHARENAME_IDX]

                        host, created_host = Host.objects.get_or_create(
                            hostname=hostname)
                        user, created_user = User.objects.get_or_create(
                            username=username)

                        event5140, createdevent = Event_5140.objects.get_or_create(
                            time=time,
                            host=host,
                            user=user,
                            sharename=sharename)
                        if createdevent:
                            print "new 5140 created for host " + hostname

                    # 4672 Privileged Auth
                    if row[ID_IDX] == "4672":
                        strtime = row[TIME_IDX]
                        time = getdate(strtime)

                        hostname = row[MACHINENAME_IDX]
                        username = row[SUBJECTUSERNAME_IDX].lower()
                        privilegelist = row[PRIVILEGE_IDX]

                        host, created_host = Host.objects.get_or_create(
                            hostname=hostname)
                        user, created_user = User.objects.get_or_create(
                            username=username)

                        event4672, createdevent = Event_4672.objects.get_or_create(
                            time=time,
                            host=host,
                            user=user,
                            privilegelist=privilegelist)
                        if createdevent:
                            print "new 4672 created for host " + hostname

                    # 7045 New Service created event
                    elif row[ID_IDX] == "7045":

                        strtime = row[TIME_IDX]
                        time = getdate(strtime)
                        hostname = row[MACHINENAME_IDX]
                        servicename = row[SERVICENAME_IDX]
                        imagepath = row[IMAGEPATH_IDX]
                        servicetype = row[SERVICETYPE_IDX]

                        service, service_created = Service.objects.get_or_create(
                            servicename=servicename,
                            imagepath=imagepath,
                            servicetype=servicetype)
                        host, created_host = Host.objects.get_or_create(
                            hostname=hostname)
                        event7045, createdevent = Event_7045.objects.get_or_create(
                            time=time, host=host, service=service)
                        if createdevent:
                            print "new 7045 created for " + hostname

                    # 4698 New Scheduled Task event
                    elif row[ID_IDX] == "4698":

                        strtime = row[TIME_IDX]
                        time = getdate(strtime)

                        hostname = row[MACHINENAME_IDX]
                        taskname = row[TASKNAME_IDX]
                        taskdetails = row[TASKCONTENT_IDX]
                        #arguments = re.findall('<Arguments>(.*?)</Arguments>', taskdetails, re.DOTALL)[0]
                        #cmdline = re.findall('<Command>(.*?)</Command>', taskdetails, re.DOTALL)[0]
                        try:
                            arguments = find_between(taskdetails,
                                                     "<Arguments>",
                                                     "</Arguments>")
                        except:
                            arguments = ""
                        try:
                            cmdline = find_between(taskdetails, "<Command>",
                                                   "</Command>")
                        except:
                            cmdline = ""
                        host, created_host = Host.objects.get_or_create(
                            hostname=hostname)
                        task, taskcreated = Task.objects.get_or_create(
                            taskname=taskname,
                            imagepath=cmdline,
                            arguments=arguments)
                        event4698, createdevent = Event_4698.objects.get_or_create(
                            time=time, host=host, task=task)
                        if createdevent:
                            print "new 4698 created for " + hostname

                    # WMI event
                    elif row[ID_IDX] == "2":

                        strtime = row[TIME_IDX]
                        time = getdate(strtime)

                        hostname = row[MACHINENAME_IDX]
                        wmidetails = row[MESSAGGE_IDX]
                        operationid = re.findall('GroupOperationId = (.*?);',
                                                 wmidetails, re.DOTALL)[0]
                        operation = re.findall('Operation = (.*?);',
                                               wmidetails, re.DOTALL)[0]
                        host, created_host = Host.objects.get_or_create(
                            hostname=hostname)
                        WmiEvent_2.objects.create(time=time,
                                                  host=host,
                                                  operationid=operationid,
                                                  operation=operation)
                        print "new Wmi Event 2 created for " + hostname
def parser(log, format="str"):
    """Parse one access-log line into an output row.

    The line is examined in two ways: split on whitespace (the first token
    is taken as the client IP), and searched with the module-level
    ``group_by_quote`` pattern, whose match has its double quotes removed
    and is then split on whitespace. From that second list, index 0 is used
    as the request type, index 1 as the requested URL, index 3 as the HTTP
    status and index 5 as the redirection URL (a ``"-"`` placeholder becomes
    an empty string).

    The timestamp is the text ``find_between`` returns for ``[`` ... ``]``
    and must match ``%d/%b/%Y:%H:%M:%S +0000``. It is then converted from
    UTC to the module-level ``timezone``; if that conversion fails, the
    traceback is printed and the unconverted value is used.

    Args:
        log: A single log line as a string.
        format: Output shape. ``"str"`` renders the row through
            ``get_row_format()``, ``"tuple"`` returns ``params.values()``
            and ``"json"`` returns the parameter mapping itself. Any other
            value yields an empty string.

    Returns:
        The formatted row, the values of the parameter mapping, the mapping
        itself, or ``""`` depending on ``format``.

    Raises:
        AttributeError: If ``group_by_quote.search`` finds no match
            (assuming it returns ``None`` like a compiled regex -- confirm).
        IndexError: If the line has fewer fields than the indexes read here.
        ValueError: If the timestamp does not match the expected layout or
            the status field is not an integer.
    """
    tokens = log.split()
    quoted = group_by_quote.search(log).group().replace('"', '').split()

    # Generated identifiers attached to every parsed row.
    request_id = str(uuid.uuid4())
    visitor_id = id_generator()
    user_id = next(get_next_user_id())
    process_name = "nginx"

    ip = tokens[0]
    request_type = quoted[0]

    # Parse the request target once; path and query are both read from it.
    target = urlparse.urlparse(quoted[1])
    url = target.path

    # Example timestamp: 06/Nov/2015:06:47:45 +0000
    str_dt = find_between(log, "[", "]").lstrip()
    dt = datetime.strptime(str_dt, "%d/%b/%Y:%H:%M:%S +0000")
    try:
        dt = dt.replace(tzinfo=tz.gettz('UTC')).astimezone(timezone)
    except Exception:
        # Best effort: report the failure and keep the unconverted value.
        # KeyboardInterrupt / SystemExit are deliberately not swallowed.
        import traceback
        traceback.print_exc()

    status = int(quoted[3])

    params = create_param(urlparse.parse_qs(target.query))
    redirection_url = quoted[5]
    if redirection_url == "-":
        redirection_url = ""

    # Keys are assigned one by one, in this order, so the mapping's
    # insertion history (and therefore ``values()``) stays as before.
    params["request_id"] = request_id
    params["redirection_url"] = redirection_url
    params["user_id"] = user_id or 'NULL'
    params["visitor_id"] = visitor_id
    params["process_name"] = process_name
    params["ip"] = ip
    params["request_type"] = request_type
    params["url"] = url
    params["event_at"] = dt.strftime("%Y-%m-%d %H:%M:%S")
    params["http_status"] = status
    params["created_at"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    if format == "json":
        return params
    if format == "tuple":
        return params.values()
    if format == "str":
        return get_row_format().format(**params)
    # Unknown format names fall through to an empty row.
    return ""
Beispiel #22
0
 def set_name(self):
     """Populate ``self.name`` from ``self.html``.

     Takes the text ``find_between`` returns between the
     ``target="_blank" class="tab-link">`` marker and the next ``</a>``,
     removes any ``<b>`` / ``</b>`` tags from it and strips surrounding
     whitespace.
     """
     raw_name = find_between(self.html,
                             'target="_blank" class="tab-link">',
                             '</a>')
     # Drop bold markup that may wrap the name inside the link.
     for tag in ('<b>', '</b>'):
         raw_name = raw_name.replace(tag, '')
     self.name = raw_name.strip()
Beispiel #23
0
        f for f in listdir(cfg.PATH_OUTPUT)
        if (isfile(join(cfg.PATH_OUTPUT, f)) & (f.endswith('.txt')))
]:
    results = dict()
    title = ''
    f = ''
    fstd = ''
    mae = ''
    maestd = ''
    for line in open(cfg.PATH_OUTPUT + fName, 'r'):
        line = line.replace('\n', '')
        if ('Executing for' in line):
            #write away previous results
            results[title] = [mae, maestd, f, fstd]

            title = 'cross_' + utils.find_between(line, 'Executing for ',
                                                  'model').strip()
        elif ('Average Mean absolute error (official): ' in line):
            mae = line.split("(official): ", 1)[1]
        elif ('Average Mean absolute error (official); std: ' in line):
            maestd = line.split('std: ', 1)[1]
        elif ('Average F1-measure: ' in line):
            f = line.split("measure: ", 1)[1]
        elif ('Average F1-measure; std: ' in line):
            fstd = line.split("std: ", 1)[1]
        elif ("Scores for " in line):
            #write away previous, doing for test now
            results[title] = [mae, maestd, f, fstd]

            title = 'test_' + utils.find_between(line, 'Scores for ',
                                                 ':').strip()
    results[title] = [mae, maestd, f, fstd]