def search_demo(api):
    """Print a sample of ASCII-only Twitter search results for 'politics'.

    Args:
        api: a python-twitter style client exposing GetSearch().
    """
    results = api.GetSearch(term='politics', count=MAX_NUM_SEARCH_RESULTS)
    # Keep only tweets whose text and author name are pure ASCII so the
    # console output cannot trip over the terminal encoding.
    # (List comprehension instead of filter(lambda ...): clearer and the
    # result is a real list on both Python 2 and 3.)
    results = [r for r in results
               if is_ascii(r.text) and is_ascii(r.user.name)]
    # Single-argument print(...) calls are valid on both Python 2 and 3,
    # unlike the original print statements.
    print('Twitter search results for politics...\n')
    for res in results:
        print('This {0}tweet was written by {1}'.format(
            'possibly sensitive ' if res.possibly_sensitive else '',
            res.user.name))
        print('It contains the following (expanded out) urls ->')
        for url in res.urls:
            print('\t' + url.expanded_url)
        print('The tweet itself is ->\n\t{0}'.format(res.text))
        print('')
def process_country_gender(params):
    """Emit gender and country training rows for the Chinese-name dataset.

    Reads the tab-separated utf-8 file named by
    params['process_country_gender_input_file_zh'] (columns: id, name,
    place_of_birth, nationality, male_cnt, female_cnt), romanises Chinese
    names to pinyin, and writes "name\\tcountry" rows and "name\\tgender"
    rows to the two configured output files.

    NOTE(review): a later definition in this module reuses the same
    function name and will shadow this one at import time — confirm which
    variant callers expect.
    """
    def confidence_ok(confidence):
        # Optional threshold: if 'process_country_gender_confidence' is
        # configured the score must exceed it, otherwise any positive
        # score passes.  (Was duplicated three times inline.)
        if 'process_country_gender_confidence' in params:
            return confidence > params['process_country_gender_confidence']
        return confidence > 0

    with codecs.open(params['process_country_gender_input_file_zh'],
                     encoding='utf-8', mode='r') as fin, \
            open(params['process_country_gender_output_file_gender_zh'], 'w') as fgender, \
            open(params['process_country_gender_output_file_country_zh'], 'w') as fcountry:
        line_idx = 0
        male_total = 0
        female_total = 0
        for line in fin:
            line_idx += 1
            if line_idx % 10000 == 0:
                print('%d lines processed' % line_idx)
            # First field was bound to `id`, shadowing the builtin; it is
            # unused, so discard it.
            _, name, place_of_birth, nationality, male_cnt, female_cnt = \
                line.strip().split('\t')
            if is_chinese(name):
                name = convert_to_pinyin(name)
            if not utils.is_ascii(name):
                continue
            confidence, country = get_place_of_birth(place_of_birth)
            if confidence_ok(confidence):
                # BUG FIX: the original wrote '%s\t%d' with no trailing
                # newline here (and in the nationality fallback below),
                # concatenating every country row onto a single line.
                fcountry.write('%s\t%d\n' % (name, country))
            else:
                confidence, country = get_nationality(nationality)
                if confidence_ok(confidence):
                    fcountry.write('%s\t%d\n' % (name, country))
            confidence, gender = get_gender(male_cnt, female_cnt)
            if confidence_ok(confidence):
                fgender.write('%s\t%d\n' % (name, gender))
                if gender == 1:
                    male_total += 1
                else:
                    female_total += 1
    print('Male: %d, Female: %d\n' % (male_total, female_total))
def write_transfer_data_file_simple_text(transfer_data_file, transfer_content,
                                         type_value, with_cve_db=False, cve='',
                                         title_or_content='c', db=''):
    """Tokenise *transfer_content* and write one CoNLL-style row per token.

    Each kept token produces the row "<token> <tag> O[<cve suffix>]\\n".
    The tag is 'S-<type_value>software' only for the literal token
    'Access' when type_value is 'vulnerable_'; every other token gets
    'O'.  Non-ASCII tokens and tokens starting with 'www.' are skipped.

    Args:
        transfer_data_file: open writable file-like object.
        transfer_content: raw text to tokenise with nltk.
        type_value: label family; only 'vulnerable_' triggers tagging.
        with_cve_db: when True, append " cve title_or_content db" to rows.
    """
    append_str = ''
    if with_cve_db:
        append_str = ' ' + cve + ' ' + title_or_content + ' ' + db
    for ww in nltk.word_tokenize(transfer_content):
        if not utils.is_ascii(ww) or ww.startswith('www.'):
            continue
        # The original's inner and outer else-branches were identical, so
        # the nested conditionals collapse to this single test.  An unused
        # idx counter was also dropped.
        if type_value in ['vulnerable_'] and ww == 'Access':
            transfer_data_file.write(ww + ' ' + 'S-' + type_value + 'software' + ' O' + append_str + '\n')
        else:
            transfer_data_file.write(ww + ' ' + 'O' + ' O' + append_str + '\n')
def __non_alpha_filter(self, text: str) -> "tuple[str, dict]":
    """Strip every character that is not an ASCII letter from *text*.

    Returns:
        A pair (filtered_text, removed) where *removed* maps each
        original character index to the character dropped there.

    Note: the return annotation was the tuple literal ``(str, dict)``,
    which is not a valid type hint; it is now a proper tuple annotation
    (string form, so no typing import is needed at runtime).
    """
    removed = {}
    kept = StringIO()
    for pos, ch in enumerate(text):
        # A character survives only when it is both ASCII and alphabetic
        # (De Morgan of the original negative test; same behavior).
        if utils.is_ascii(ch) and ch.isalpha():
            kept.write(ch)
        else:
            removed[pos] = ch
    return kept.getvalue(), removed
def trending_demo(api):
    """Print the first NUM_TRENDING_TOPICS worldwide trending topics."""
    # Gets worldwide trending topics. We can also specify a region
    # (like USA or California or SF).
    #
    # BUG FIX (portability): the original sliced the result of filter(),
    # which works on Python 2 (filter returns a list) but raises
    # TypeError on Python 3 where filter returns a non-subscriptable
    # iterator.  A list comprehension is sliceable on both.
    trends = [t for t in api.GetTrendsCurrent() if is_ascii(t.name)]
    print('Trending topics')
    print('---------------')
    for trend in trends[:NUM_TRENDING_TOPICS]:
        print('{0} ->\n\tquery = {1}\n\turl = {2}\n\t# tweets = {3}\n'.format(
            trend.name, trend.query, trend.url, trend.tweet_volume))
def make_one_hot_encoding(self, movie_info):
    """Build a one-hot encoded DataFrame from *movie_info*.

    Numeric columns ("year", "average_rating") are copied through.
    List-valued columns ("genres", "cast") and single-valued columns
    ("producer", "writer", "composer") become 0/1 indicator columns
    named "<category>_<value>"; empty or non-ASCII values collapse into
    the shared "Unknown" bucket.

    Args:
        movie_info: pandas DataFrame with the columns named above —
            assumes "genres"/"cast" cells are iterables of strings and
            the index values are int-convertible row positions (TODO
            confirm against caller).
    """
    row_count = movie_info.shape[0]
    # The original seeded the numeric columns twice (dict literal, then
    # an identical loop); once is enough.
    encoded = {col: movie_info[col].tolist() for col in ["year", "average_rating"]}

    def mark(category, value, row):
        # Normalise the value, lazily create the indicator column, and
        # flag this row.  (Was duplicated for both category kinds.)
        value = "Unknown" if (value == "" or not utils.is_ascii(value)) else value
        colname = "{}_{}".format(category, value)
        if colname not in encoded:
            encoded[colname] = [0] * row_count
        encoded[colname][int(row)] = 1

    for index, movie in movie_info.iterrows():
        for list_like_category in ["genres", "cast"]:
            for item in movie[list_like_category]:
                mark(list_like_category, item, index)
        for single_category in ["producer", "writer", "composer"]:
            mark(single_category, movie[single_category], index)
    return pd.DataFrame(encoded)
def post_beacon(self):
    """API used to send captured beacons from LP to Controller.

    Upserts the Implant row matching the beacon's uuid, records any
    payload carried in the beacon's 'data' field, and always answers
    HTTP 200 'Success'.
    """
    beacon = json_to_beacon(request.data)

    # Upsert the implant keyed by its uuid.
    implant = db.session.query(Implant).filter_by(uuid=beacon['uuid']).first()
    if implant is None:
        # First sighting: register a new implant.
        implant = Implant(uuid=beacon['uuid'])
        db.session.add(implant)
    else:
        # Known implant: refresh its last-seen time and external address.
        implant.last_beacon_received = datetime.now()
        implant.external_ip_address = beacon['external_ip_address']
    db.session.commit()

    # Persist any non-empty payload the beacon carried.
    if 'data' in beacon and beacon['data']:
        payload = beacon['data']
        record = DataStore(implant=[implant], timestamp=datetime.now())
        # ASCII payloads go into the text column, anything else is raw data.
        if is_ascii(payload):
            record.text_received = payload
        else:
            record.data_received = payload
        db.session.add(record)
        db.session.commit()

    return make_response('Success', 200)
def post_beacon(self):
    """API used to send captured beacons from LP to Controller.

    Upserts the Implant row matching the beacon's uuid, then — when the
    beacon carries a non-empty 'data' field — stores it in a DataStore
    row (text column when it is ASCII, raw-data column otherwise).
    Always returns HTTP 200 'Success'.
    """
    # NOTE(review): assumes json_to_beacon returns a dict with at least
    # 'uuid' and 'external_ip_address' keys — confirm against the LP side.
    beacon = json_to_beacon(request.data)
    # Check if implant already exists
    implant = db.session.query(Implant).filter_by(
        uuid=beacon['uuid']).first()
    if implant:
        # Update existing implant
        implant.last_beacon_received = datetime.now()
        implant.external_ip_address = beacon['external_ip_address']
        db.session.commit()
    else:
        # Add new implant
        implant = Implant(uuid=beacon['uuid'])
        db.session.add(implant)
        db.session.commit()
    # Store beacon data
    if 'data' in beacon:
        beacon_data = beacon['data']
        if beacon_data:
            datastore = DataStore(implant=[implant], timestamp=datetime.now())
            # ASCII payloads are stored as text, everything else as raw bytes.
            if is_ascii(beacon_data):
                datastore.text_received = beacon_data
            else:
                datastore.data_received = beacon_data
            db.session.add(datastore)
            db.session.commit()
    http_return_code = 200
    response = make_response('Success', http_return_code)
    return response
def test_is_ascii():
    """is_ascii is true only for strings made entirely of ASCII characters."""
    assert is_ascii('abcd123')
    # Hangul must fail whether given as a plain literal or an explicit
    # unicode literal.
    for non_ascii_sample in ('가나다', u'가나다'):
        assert not is_ascii(non_ascii_sample)
def _load_country_tables(countries_path):
    """Read the tab-separated country file (header row + fips/iso/tsd/name).

    Returns (country_dict, country_id): lowercase country name -> dense
    integer id (row order), and fips code -> lowercase country name.
    """
    country_dict = {}
    country_id = {}
    with codecs.open(countries_path, encoding='utf-8', mode='r') as fin:
        lines = fin.readlines()
    for i in range(1, len(lines)):  # row 0 is the header
        fips, iso, tsd, country = lines[i].strip().split('\t')
        country = country.lower()
        country_dict[country] = i - 1
        country_id[fips] = country
    return country_dict, country_id


def _load_city_countries(cities_path, country_id):
    """Map each unambiguous lowercase city name to its country name.

    City names appearing under more than one fips code are dropped —
    they cannot identify a single country.  Rows without exactly two
    tab-separated fields are skipped.
    """
    city_country = {}
    with codecs.open(cities_path, encoding='utf-8', mode='r') as fin:
        lines = fin.readlines()
    for i in range(1, len(lines)):
        fields = lines[i].strip().split('\t')
        if len(fields) != 2:
            continue
        fips, city = fields
        city = city.lower()
        # A repeated city name is ambiguous; poison it so the filter below
        # removes it.  NOTE(review): country_id[fips] raises KeyError for
        # fips codes absent from the countries file — confirm the inputs
        # are consistent.
        if city in city_country:
            city_country[city] = 'NOTACOUNTRY'
        else:
            city_country[city] = country_id[fips]
    return {city: cc for city, cc in city_country.items()
            if cc != 'NOTACOUNTRY'}


def _build_city_trie(city_country, country_dict):
    """Compress the city->country map into a per-character prefix trie.

    Each node is {char: [country_id_or_'N', children]}; the numeric id is
    stored on the node of a city's final character, 'N' marks non-final
    nodes.
    """
    trie = {}
    for city in city_country:
        node = trie
        last = len(city) - 1
        for i, c in enumerate(city):
            if c not in node:
                node[c] = ['N', {}]
            if i == last:
                node[c][0] = country_dict[city_country[city]]
            node = node[c][1]
    return trie


def process_country_gender(params):
    """Build geo lookup tables, then emit country and gender training rows.

    Loads the country and city tables named in *params*, writes the
    country-id mapping file, compresses the city map into a prefix trie,
    and streams the main input file writing "name\\tcountry" and
    "name\\tgender" rows for ASCII names whose confidence clears the
    (optional) configured threshold.

    NOTE(review): this redefines the earlier function of the same name
    (the *_zh variant) and shadows it at import time.
    """
    assert 'process_country_gender_countries' in params
    assert 'process_country_gender_cities' in params

    def confidence_ok(confidence):
        # Optional threshold: with a configured value the score must beat
        # it, otherwise any positive score passes.  (Was duplicated three
        # times inline.)
        if 'process_country_gender_confidence' in params:
            return confidence > params['process_country_gender_confidence']
        return confidence > 0

    country_dict, country_id = _load_country_tables(
        params['process_country_gender_countries'])
    city_country = _load_city_countries(
        params['process_country_gender_cities'], country_id)

    # Persist the country -> id mapping, sorted by name.
    with codecs.open(params['process_country_gender_output_country_id'],
                     encoding='utf-8', mode='w') as fout:
        for country in sorted(country_dict):
            fout.write('%s\t%d\n' % (country, country_dict[country]))

    city_dict = _build_city_trie(city_country, country_dict)

    with codecs.open(params['process_country_gender_input_file'],
                     encoding='utf-8', mode='r') as fin, \
            open(params['process_country_gender_output_file_gender'], 'w') as fgender, \
            open(params['process_country_gender_output_file_country'], 'w') as fcountry:
        line_idx = 0
        male_total = 0
        female_total = 0
        for line in fin:
            line_idx += 1
            if line_idx % 10000 == 0:
                print('%d lines processed' % line_idx)
            # First field was bound to `id`, shadowing the builtin; it is
            # unused, so discard it.
            _, name, place_of_birth, nationality, male_cnt, female_cnt = \
                line.strip().split('\t')
            if not utils.is_ascii(name):
                continue
            confidence, country = get_place_of_birth(
                place_of_birth, city_dict, country_dict)
            if confidence_ok(confidence):
                fcountry.write('%s\t%d\n' % (name, country))
            else:
                confidence, country = get_nationality(
                    nationality, city_dict, country_dict)
                if confidence_ok(confidence):
                    fcountry.write('%s\t%d\n' % (name, country))
            confidence, gender = get_gender(male_cnt, female_cnt)
            if confidence_ok(confidence):
                fgender.write('%s\t%d\n' % (name, gender))
                if gender == 1:
                    male_total += 1
                else:
                    female_total += 1
    print('Male: %d, Female: %d\n' % (male_total, female_total))
def normalize_categorical(string):
    """Pass through non-empty ASCII values; map anything else to "Unknown"."""
    if utils.is_ascii(string) and len(string) > 0:
        return string
    return "Unknown"