def prep_topic_analysis(pathDataIn, pathDataOut, data_source_tag, onlyUnique=True):
    """Extract post text into a tab-separated file for topic analysis.

    Reads newline-delimited JSON records from ``pathDataIn`` in reverse order
    (last line first), keeps records that are not flagged ``is_wanted`` and
    have a non-empty ``selftext``, strips markdown/punctuation noise from
    title+body, and writes one TSV row per (optionally unique) address.

    Args:
        pathDataIn: path to the input JSON-lines file.
        pathDataOut: path for the output tab-separated file.
        data_source_tag: label written into every output row.
        onlyUnique: if True (default), emit at most one row per address;
            reading in reverse means the most recent record wins.
    """
    visits = set()
    # Noise tokens replaced by a space, in order: multi-char markers first so
    # e.g. '[[' collapses to ONE space before the single-char '[' pass runs.
    # (A dead ``.replace('', '')`` no-op from the original chain was removed.)
    _NOISE = ('\n', '[[', ']]', '**', '[', ']', '*',
              '.', '/', '!', '(', ')', ':')
    # NOTE(review): ``tail -r`` is BSD-specific; GNU systems use ``tac``.
    with subprocess.Popen(["tail", "-r", pathDataIn],
                          stdout=subprocess.PIPE,
                          universal_newlines=True).stdout as infile:
        with open(pathDataOut, 'w') as outfile:
            mcdata_csv = csv.writer(outfile, delimiter='\t')
            for mcline in infile:
                mc = json.loads(mcline)
                # keep only records with actual body text that were not
                # already flagged as wanted
                if not mc.get('is_wanted', False) and mc.get(
                        'selftext', False) and len(mc['selftext']) > 0:
                    visit = standardize_address(mc.get('mc_addr', ''))
                    if not onlyUnique or visit not in visits:
                        visits.add(visit)
                        text = mc['title'] + ' ' + mc['selftext']
                        for noise in _NOISE:
                            text = text.replace(noise, ' ')
                        # double quotes would fight the TSV quoting rules
                        text = text.replace('"', "'")
                        try:
                            mcdata_csv.writerow([
                                mc.get('post_uid', visit),
                                visit,
                                data_source_tag,
                                text,
                            ])
                        except UnicodeEncodeError:
                            # dump the offending record before re-raising so
                            # it can be inspected
                            pprint(mc)
                            raise
    print(len(visits))
def merge_sniffer_into_core_json(d_mcsniff, s_in_file, s_out_file):
    """Merge sniffer-scan records into the core JSON-lines server log.

    For each JSON line in ``s_in_file``, look up the server's standardized
    address in ``d_mcsniff``.  When found, copy sniffer-only fields (version,
    whitelist, difficulty, signs, plugins, ...) onto the record under
    ``snf_``-prefixed keys and collect its human-readable text into
    ``mc['text_short']``.  All records, matched or not, are rewritten to
    ``s_out_file`` as JSON lines.

    Args:
        d_mcsniff: dict mapping standardized "host:port" -> sniffer record.
        s_in_file: path of the input core JSON-lines file.
        s_out_file: path for the merged output JSON-lines file.
    """
    mc_addrs_sniff = d_mcsniff.keys()
    mc_json_out = []
    print(len(mc_addrs_sniff))
    counter = 0   # lines read
    counter2 = 0  # lines matched to a sniffer record
    with open(s_in_file, 'r') as infile:
        for line in infile:
            counter += 1
            mc = ujson.loads(line)
            # gather the free-text fields worth keeping for text analysis
            interesting_text = {}
            if 'motd' in mc:
                # motd may be a plain string or a {'text': ...} dict
                if isinstance(mc['motd'], str):
                    interesting_text['motd'] = mc['motd']
                elif isinstance(mc['motd'], dict):
                    interesting_text['motd'] = mc['motd']['text']
            if 'website_url' in mc and len(mc['website_url']) > 0:
                # record only *that* a website exists, not its URL
                interesting_text['website'] = 'website'
            ### sniff fields observed in the 20160712 data (collected
            ### 20160719): host/id/online/port/signs/state always present;
            ### description, protocol_version, version, player counts nearly
            ### always; error, plugins_fml, whitelist, difficulty, gamemode,
            ### hardcore, level_type, brand, players, help_p1, software,
            ### plugins, welcome often None.
            if 'mc_addr' in mc and standardize_address(
                    mc['mc_addr']) in mc_addrs_sniff:
                mc_snf = d_mcsniff[standardize_address(mc['mc_addr'])]
                counter2 += 1
                mc['reported_sniff'] = True
                mc['whitelist'] = mc_snf['whitelist']
                # a presence guard on 'protocol_version' was deliberately
                # disabled upstream; keep the unconditional copy
                mc['snf_protocol_v'] = mc_snf['protocol_version']
                if 'server_version_number' not in mc:
                    # fall through the known version sources, most
                    # trustworthy first
                    if 'version' in mc:
                        mc['server_version_number'] = mc['version']
                    elif 'game_query' in mc and 'version' in mc['game_query']:
                        mc['server_version_number'] = mc['game_query'][
                            'version']
                    elif 'game_query' in mc and (
                            'server_mod_version' in mc['game_query']
                            or 'server_mod_name' in mc['game_query']):
                        mc['server_version_number'] = (
                            mc['game_query'].get('server_mod_name', '')
                            + '_'
                            + mc['game_query'].get('server_mod_version', ''))
                    elif 'version' in mc_snf:
                        mc['server_version_number'] = mc_snf['version']
                if 'description' in mc_snf and 'description' not in mc:
                    mc['description'] = mc_snf['description']
                if 'description' in mc:
                    interesting_text['description'] = mc['description']
                if 'welcome' in mc_snf:
                    interesting_text['welcome'] = mc_snf['welcome']
                if 'help_p1' in mc_snf:
                    interesting_text['help_p1'] = mc_snf['help_p1']
                # copy scalar sniff-only fields verbatim under snf_ prefixes
                if 'difficulty' in mc_snf:
                    mc['snf_difficulty'] = mc_snf['difficulty']
                if 'gamemode' in mc_snf:
                    mc['snf_gamemode'] = mc_snf['gamemode']
                if 'hardcore' in mc_snf:
                    mc['snf_hardcore'] = mc_snf['hardcore']
                if 'level_type' in mc_snf:
                    mc['snf_level_type'] = mc_snf['level_type']
                if 'brand' in mc_snf:
                    mc['snf_brand'] = mc_snf['brand']
                if 'software' in mc_snf:
                    mc['snf_software'] = mc_snf['software']
                ### merging plugins_fml was considered and rejected: it is
                ### non-default, free-form, sometimes empty, and would
                ### duplicate names already in plugins_names.
                # plugins: merge the sniffer's "Plugins (N): a, b, c" list
                # into the omni list; any other non-empty text is kept raw
                plugins_text_sniff = mc_snf['plugins']
                if plugins_text_sniff is not None and plugins_text_sniff != '':
                    # raw string fixes the invalid "\d" escape in the
                    # original pattern
                    if re.match(r"^Plugins (\d+): ", plugins_text_sniff):
                        plugin_names_sniff = [
                            plug_name.strip() for plug_name in
                            plugins_text_sniff.partition(':')[2].split(',')
                        ]
                        plugin_names_omni = mc['plugins_names']
                        # BUGFIX: a bare set() is not JSON-serializable and
                        # made the ujson.dumps() below raise; store a
                        # deduplicated, deterministic list instead
                        mc['plugins_names'] = sorted(
                            set(plugin_names_omni + plugin_names_sniff))
                        mc['reported_plugins'] = True
                    else:
                        interesting_text[
                            'plugins_override'] = plugins_text_sniff
                ### now handle signs
                if 'signs' in mc_snf:
                    interesting_text['signs'] = []
                    num_signs = 0
                    for sign in mc_snf['signs']:
                        if len(''.join(sign['lines'])) == 0:
                            continue  # skip empty signs entirely
                        mc_signtext = '\\\\'.join(
                            sign['lines']).strip('\\\\')
                        num_signs += 1
                        interesting_text['signs'].append(mc_signtext)
                    if num_signs == 0:
                        # all signs were empty
                        interesting_text.pop('signs', None)
                    mc['snf_signs_count'] = num_signs
                else:
                    mc['snf_signs_count'] = None
            else:
                mc['reported_sniff'] = False
            mc['text_short'] = interesting_text
            mc_json_out.append(mc)
    with open(s_out_file, 'w') as outfile:
        for mc in mc_json_out:
            outfile.write(ujson.dumps(mc))
            outfile.write("\n")
    print('second number gives number of servers matched to sniff data')
    print(counter, counter2)
### NOTE(review): fragment — the `def` enclosing this `if failed:` block is
### outside the visible chunk; the `return` belongs to that function.
if failed:
    # retry the parse starting just past the second opening bracket
    row = get_valid_json_with_halflines(
        line, (start_char + i_second_open_bracket + 1))
    return (row)


### load sniffer json into a giant dict indexed by the server location
sniffer_dataset = get_freshest_data_date("lib_datasets_sniffer.txt")
d_mcsniff = {}  # standardized "host:port" -> sniffer record
with open(pathData + 'mcsniffer/' + sniffer_dataset + '/' + "out_servers.json",
          'r') as f_mcdata_in:
    for line in f_mcdata_in:
        # tolerate truncated/garbled lines; skip those that cannot be parsed
        mc = get_valid_json_with_halflines(line.strip())
        if not mc:
            continue
        mc['mc_addr'] = standardize_address(mc['host'] + ':' + str(mc['port']))
        d_mcsniff[mc['mc_addr']] = mc


### use that to structure merge into existing json
### NOTE(review): this def is cut off at the end of this chunk; a complete
### copy of the same function appears earlier in the file.
def merge_sniffer_into_core_json(d_mcsniff, s_in_file, s_out_file):
    mc_addrs_sniff = d_mcsniff.keys()
    mc_json_out = []
    print(len(mc_addrs_sniff))
    counter = 0   # lines read
    counter2 = 0  # lines matched to a sniffer record
    with open(s_in_file, 'r') as infile:
        for line in infile:
            counter += 1
            #if counter % 100 != 0: continue
            mc = ujson.loads(line)
### use that to structure merge into existing json print(len(d_mcs_org.keys())) print("merge mcs.org into concatenated omni logs") counter = 0 counter2 = 0 mc_json_out = [] copyfile(pathData+"step3_scraped_omnimc_posts"+".json", pathData+"tmp_step3_scraped_omnimc_posts"+".json") with open(pathData+"tmp_step3_scraped_omnimc_posts"+".json", 'r') as infile: for line in infile: counter += 1 mc = ujson.loads(line) ### merging of relevant fields, incl ["id", "title", "selftext", "primary_tags", "ip", "port", "version", "banner", "created", "updated", "youtube_video", "website_url", "country_code", "votes", "rank", "uptime", "totaltime", "daily_uptime", "daily_totaltime"] mco = False #print(mc['dataset_date'], map_omni_to_mcs_org.get(mc['dataset_date'], False), True if d_mcs_org.get(standardize_address(mc['mc_addr'])+'_'+map_omni_to_mcs_org[mc['dataset_date']], False) else False) if map_omni_to_mcs_org.get(mc['dataset_date'], False): if d_mcs_org.get(standardize_address(mc['mc_addr'])+'_'+map_omni_to_mcs_org[mc['dataset_date']], False): mco = d_mcs_org[ standardize_address(mc['mc_addr'])+'_'+map_omni_to_mcs_org[ mc['dataset_date'] ] ] if mco: counter2 += 1 mc['dataset_source'] = 'mcs_org' mc['title'] = mco['title'] mc['selftext'] = mco['description'] mc['primary_tags'] = mco['tags'] mc['ip'] = mco['ip'].rstrip() mc['port'] = mco['port'] mc['server_version_number'] = mco['version'] mc['banner'] = mco['banner'] mc['created'] = mco['created'] mc['updated'] = mco['updated'] mc['youtube_video'] = mco['youtube_video'] mc['website_url'] = mco['website_url']
def write_scrape_csv_row(dObs, playerWriter, serverWriter):
    """Write one scrape observation to the player and server CSVs.

    Emits one row per observed player (timestamp, address, md5 of the player
    name) to ``playerWriter`` and one summary row to ``serverWriter``, then
    returns a dict of 0/1 indicator statistics describing how trustworthy the
    server's self-reported player counts look.
    """
    dObs['mc_addr'] = standardize_address(dObs['mc_addr'])
    npopObs = len( dObs['players'] )  # NOTE(review): unused below
    for p in dObs['players']:
        # player identifiers are stored hashed, never in the clear
        playerWriter.writerow([dObs['timestamp'], dObs['mc_addr'],
                               hashlib.md5(p.encode('utf-8')).hexdigest()])
    # heuristic: True when the reported counts are self-inconsistent and the
    # server API therefore looks modified ("hacked")
    hackedAPI = (
        ### not sure why, but an empty list is a reliable signal of a hacked API
        len( dObs['players'] ) == 0 or
        ### must be equal
        (len( dObs['players'] ) == 1 and dObs['players_online'] > 3) or
        ### must be close
        ### NOTE(review): this term is always False (x - 10 > x can never
        ### hold); the left side was presumably meant to involve
        ### len(dObs['players']) — confirm intent before relying on it
        (len( dObs['players'] ) > 1 and dObs['players_online'] - 10 > dObs['players_online']) or
        ### must be close, or, if players_online is zero, then the player
        ### list is actually still trustworthy and mismatches are OK. This
        ### actually has some false positives, particularly among big
        ### servers. The general rule remains that a mod to the API is
        ### disqualifying, unless I have a guarantee for a special case
        ### (like zero) that API modifications are safe
        (dObs['players_online'] > 0 and len( dObs['players'] ) - 9 > dObs['players_online']) or
        ### and the tolerance is bigger for bigger servers, because more room
        ### for lag to affect synching of counts
        (len( dObs['players'] ) > 50 and len( dObs['players'] ) - 20 > dObs['players_online']) or
        ### don't exceed max (plus/minus noise/lag)
        dObs['players_max'] + 2 < len( dObs['players'] ) or
        dObs['players_max'] + 2 < dObs['players_online'] or
        ### negative and zero are impossible
        dObs['players_max'] <= 0 or
        ### negative is impossible
        dObs['players_online'] < 0 or
        #### player array replaced by int (18,14,10,1,or 0). When this
        #### happens, length of list is never greater than 1
        (len( dObs['players'] ) == 1 and len( dObs['players'][0]) < 3) or
        # known-bad hosts, matched case-insensitively
        dObs['mc_addr'].lower() in ( "131.153.5.218", "alpa.playmcm.net", "playmcm.net", "pvp.originmc.org" )
        )
    # retained debugging probes:
    #if dObs['players_online'] < len(dObs['players']):
    #if (dObs['players_online'] - 1000) > len(dObs['players']):
    #if len (dObs['players']) > 1 and len(dObs['players'][0]) < 3:
    #print("xxx", len(dObs['players']), dObs['players_online'], dObs['players_max'], dObs['players'])
    #if dObs['players_online'] + 10 < len(dObs['players']):
    #print("yyy", len(dObs['players']), dObs['players_online'], dObs['players_max'], dObs['players'])
    #if len (dObs['players']) > 1:
    #print( dObs['players'])
    #print(len(dObs['players']), dObs['players_online'], dObs['players_max'], dObs)
    #print()
    serverWriter.writerow([dObs['timestamp'], dObs['mc_addr']
        , 1 if dObs['reported_status'] else 0
        , 1 if dObs['reported_sample'] else 0
        , 1 if dObs['reported_query'] else 0
        , dObs['players_max']
        , len(dObs['players'])
        , dObs.get('latency', -1)
        , hackedAPI
        ])
    statistics = {}
    if True:
        #if len(dObs['players']) == 1 and len(dObs['players'][0]) < 10: print(dObs['players'])
        statistics['countInternal'] = 1
        ## always one or the other of these two:
        statistics['playerKeyInRow'] = 1 if 'players' in dObs else 0
        statistics['playerKeyNotInRow'] = 1 if not 'players' in dObs else 0
        ### always one or the other of these three
        ### if list is empty, 10:1 chances that players_online is a lie.
        ### so I'm using emptiness as one flag of hacked APIs
        statistics['playerListEmpty'] = 1 if len( dObs['players'] ) == 0 else 0
        statistics['playerListLen1'] = 1 if len( dObs['players'] ) == 1 else 0
        statistics['playerListLenBig'] = 1 if len( dObs['players'] ) > 1 else 0
        #statistics['playerListLenCorrected'] = len( dObs['players'] )- (1 if "00000000-0000-0000-0000-000000000000" in dObs['players'] else 0)
        #### the dummy UUID isn't a sign of badness, just of a certain type
        #### of plugin installed, in which case it means the op is online
        statistics['playerListDummy'] = 1 if "00000000-0000-0000-0000-000000000000" in dObs['players'] else 0
        statistics['playerListDummyEmbedded'] = 1 if statistics['playerListDummy'] and statistics['playerListLenBig'] else 0
        statistics['playerListIntDummy'] = 1 if len(dObs['players']) > 0 and len(dObs['players'][0]) < 3 else 0
        ### always true
        statistics['playersReported'] = 1 if 'players_online' in dObs else 0
        ### usually true
        statistics['playersReportedEqualTruth'] = 1 if dObs['players_online'] == len( dObs['players'] ) else 0
        #### these are flags of a hacked API
        statistics['playersReportedOverTruth'] = 1 if dObs['players_online'] > len( dObs['players'] ) else 0
        statistics['playersReportedUnderTruth'] = 1 if dObs['players_online'] < len( dObs['players'] ) else 0
        ### always false
        statistics['playersReportedNull'] = 1 if dObs['players_online'] is None else 0
        statistics['playersReportedFalse'] = 1 if dObs['players_online'] is False else 0
        statistics['playersReported0'] = 1 if dObs['players_online'] == 0 else 0
        statistics['playersReported0Alone'] = 1 if dObs['players_online'] == 0 and len( dObs['players'] ) > 0 else 0
        ### true 9 times out of 10
        statistics['playersReportedNotEqualTruth0'] = 1 if dObs['players_online'] != 0 and len( dObs['players'] ) == 0 else 0
        statistics['playersReportedNegative'] = 1 if dObs['players_online'] < 0 else 0
        statistics['playersMaxNegative'] = 1 if dObs['players_max'] < 0 else 0
        ### this is never true
        statistics['apimod1'] = 1 if not dObs['reported_sample'] and len( dObs['players'] ) > 0 else 0
        ### this is most often true
        statistics['apimod2'] = 1 if not dObs['reported_query'] and len( dObs['players'] ) > 0 else 0
        ### these are rarely true, and signs of a hacked API
        statistics['apimod3'] = 1 if dObs['players_max'] < len( dObs['players'] ) else 0
        statistics['apimod4'] = 1 if dObs['players_max'] < dObs['players_online'] else 0
        statistics['apimod5'] = 1 if hackedAPI else 0
    return(statistics)
### NOTE(review): fragment — this code sits inside a per-row loop of an
### mcs.org parsing routine whose loop header is outside this chunk;
### `continue` refers to that loop, and `i_rowcount`, `thedate`, `parse`,
### `standardize_address`, `l_ids_obs1`/`l_ids_obs2` are defined there.
if len(mc) != 19:
    # mcs.org rows are expected to have exactly 19 columns
    print("PROBLEM at {rc}, row length:{rlength}".format(
        rc=i_rowcount, rlength=len(mc)))
    print('\n'.join(mc))
for key, entry in iter(mc.items()):
    # 'N' is the dump's marker for NULL
    if entry == 'N':
        mc[key] = None
if mc['id'] == None:
    # skip rows with no id
    #i_badrowcount += 1
    #print "bad row:", mc)
    continue
i_rowcount += 1
#if i_rowcount > 100: break
### data formatting
mc['id'] = int(mc['id'])
mc['mc_addr'] = standardize_address(mc['ip'] + ':' + mc['port'])
mc['port'] = int(mc['port'])
mc['dataset_date'] = thedate
mc['dataset_source'] = 'mcs_org'
# NOTE(review): 'date_created' is derived from the 'updated' timestamp —
# confirm that is intentional
mc['date_created'] = parse(mc['updated']).strftime('%Y%m%d')
mc['post_uid'] = mc['mc_addr'] + '_' + mc['dataset_date']
mc['votes'] = int(mc['votes'])
mc['rank'] = int(mc['rank'])
mc['totaltime'] = int(mc['totaltime'])
mc['daily_totaltime'] = int(mc['daily_totaltime'])
mc['uptime'] = int(mc['uptime'])
mc['daily_uptime'] = int(mc['daily_uptime'])
# split the comma-separated tag string into a clean list
mc['tags'] = [tag.strip() for tag in mc['tags'].split(',')]
### new fields
#mc['measure_one'] = str(mc['id']) in l_ids_obs1
#mc['measure_two'] = str(mc['id']) in l_ids_obs2