def search_demo(api):
    """Print a short report for ASCII-only tweets matching 'politics'.

    Requests up to MAX_NUM_SEARCH_RESULTS results via ``api.GetSearch`` and
    keeps only those whose text and author name both pass ``is_ascii``. For
    every remaining tweet the author, the expanded URLs and the tweet text
    are printed.

    Args:
        api: client object exposing ``GetSearch(term=..., count=...)``.
    """
    tweets = api.GetSearch(term='politics', count=MAX_NUM_SEARCH_RESULTS)
    ascii_tweets = [tweet for tweet in tweets
                    if is_ascii(tweet.text) and is_ascii(tweet.user.name)]

    print('Twitter search results for politics...\n')
    for tweet in ascii_tweets:
        if tweet.possibly_sensitive:
            sensitivity = 'possibly sensitive '
        else:
            sensitivity = ''
        print('This {0}tweet was written by {1}'.format(sensitivity,
                                                        tweet.user.name))
        print('It contains the following (expanded out) urls ->')
        for link in tweet.urls:
            print('\t' + link.expanded_url)
        print('The tweet itself is ->\n\t{0}'.format(tweet.text))
        print('')
def process_country_gender(params):
    """Split Chinese-name records into country and gender label files.

    Reads tab-separated records (id, name, place of birth, nationality,
    male count, female count) from
    ``params['process_country_gender_input_file_zh']``. Names accepted by
    ``is_chinese`` are converted with ``convert_to_pinyin``; when the result
    is ASCII, a ``name<TAB>country`` line and a ``name<TAB>gender`` line are
    written to the respective output files, each only if the matching
    confidence is strictly above the threshold. Progress is printed every
    10000 lines and the male/female totals are printed at the end.

    Args:
        params: mapping with the keys
            ``process_country_gender_input_file_zh``,
            ``process_country_gender_output_file_gender_zh``,
            ``process_country_gender_output_file_country_zh`` and, optionally,
            ``process_country_gender_confidence`` (threshold; 0 when absent).

    Raises:
        ValueError: if a line does not split into exactly six fields.
    """
    # One threshold for all three decisions: the configured value, or 0 when
    # nothing is configured. Confidence must be strictly greater than it.
    if 'process_country_gender_confidence' in params:
        threshold = params['process_country_gender_confidence']
    else:
        threshold = 0

    with codecs.open(params['process_country_gender_input_file_zh'], encoding='utf-8', mode='r') as fin:
        with open(params['process_country_gender_output_file_gender_zh'], 'w') as fgender:
            with open(params['process_country_gender_output_file_country_zh'], 'w') as fcountry:
                male_total = 0
                female_total = 0
                for line_idx, line in enumerate(fin, 1):
                    if line_idx % 10000 == 0:
                        print('%d lines processed' % line_idx)
                    (_record_id, name, place_of_birth, nationality,
                     male_cnt, female_cnt) = line.strip().split('\t')
                    if not is_chinese(name):
                        continue
                    name = convert_to_pinyin(name)
                    if not utils.is_ascii(name):
                        continue

                    confidence, country = get_place_of_birth(place_of_birth)
                    if not confidence > threshold:
                        # Birthplace was inconclusive; fall back to nationality.
                        confidence, country = get_nationality(nationality)
                    if confidence > threshold:
                        # The trailing newline keeps one record per line,
                        # matching the gender output below.
                        fcountry.write('%s\t%d\n' % (name, country))

                    confidence, gender = get_gender(male_cnt, female_cnt)
                    if confidence > threshold:
                        fgender.write('%s\t%d\n' % (name, gender))
                        # Gender value 1 is tallied as male, anything else
                        # as female.
                        if gender == 1:
                            male_total += 1
                        else:
                            female_total += 1
                print('Male: %d, Female: %d\n' % (male_total, female_total))
def write_transfer_data_file_simple_text(transfer_data_file,
                                         transfer_content,
                                         type_value,
                                         with_cve_db=False,
                                         cve='',
                                         title_or_content='c',
                                         db=''):
    """Write one tagged line per token of ``transfer_content``.

    The text is tokenized with ``nltk.word_tokenize``. Tokens that are not
    ASCII or that start with ``'www.'`` are skipped. Every other token is
    written as ``<token> <label> O[ <cve> <title_or_content> <db>]``, where
    the label is ``'S-' + type_value + 'software'`` only for the token
    ``'Access'`` when ``type_value`` is ``'vulnerable_'``, and ``'O'``
    otherwise.

    Args:
        transfer_data_file: object with a ``write`` method.
        transfer_content: text to tokenize.
        type_value: label prefix; only ``'vulnerable_'`` triggers tagging.
        with_cve_db: when true, append the cve/title_or_content/db columns.
        cve: first appended column.
        title_or_content: second appended column.
        db: third appended column.
    """
    if with_cve_db:
        suffix = ' ' + cve + ' ' + title_or_content + ' ' + db
    else:
        suffix = ''

    for token in nltk.word_tokenize(transfer_content):
        if not utils.is_ascii(token) or token.startswith('www.'):
            continue
        if type_value in ['vulnerable_'] and token == 'Access':
            label = 'S-' + type_value + 'software'
        else:
            label = 'O'
        transfer_data_file.write(token + ' ' + label + ' O' + suffix + '\n')
Example #4
0
 def __non_alpha_filter(self, text: str) -> (str, dict):
     """Strip every character that is not an ASCII letter from ``text``.

     Returns:
         A pair ``(kept, removed)``. ``kept`` is the text reduced to the
         characters that pass both ``utils.is_ascii`` and ``str.isalpha``;
         ``removed`` maps the original index of each dropped character to
         that character.
     """
     removed = {}
     kept = []
     for position, char in enumerate(text):
         if utils.is_ascii(char) and char.isalpha():
             kept.append(char)
         else:
             removed[position] = char
     return ''.join(kept), removed
Example #5
0
def trending_demo(api):
    """Print the first NUM_TRENDING_TOPICS trends whose name is ASCII.

    Trends come from ``api.GetTrendsCurrent()``; those whose name fails
    ``is_ascii`` are dropped before the first NUM_TRENDING_TOPICS are printed
    with their query, url and tweet volume.

    Args:
        api: client object exposing ``GetTrendsCurrent()``.
    """
    # Gets worldwide trending topics. We can also specify a region (like USA or California or SF)
    # A real list is built here because the result is sliced below; on
    # Python 3 filter() returns an iterator, which does not support slicing.
    trends = [trend for trend in api.GetTrendsCurrent()
              if is_ascii(trend.name)]

    print('Trending topics')
    print('---------------')

    for trend in trends[:NUM_TRENDING_TOPICS]:
        print('{0} ->\n\tquery = {1}\n\turl = {2}\n\t# tweets = {3}\n'.format(
            trend.name, trend.query, trend.url, trend.tweet_volume))
 def make_one_hot_encoding(self, movie_info):
     """Build a one-hot encoded DataFrame from per-movie metadata.

     ``year`` and ``average_rating`` are copied through unchanged. Every
     value found in the list-valued ``genres``/``cast`` columns and in the
     scalar ``producer``/``writer``/``composer`` columns gets its own 0/1
     column named ``<column>_<value>``. Empty-string and non-ASCII values
     share the column ``<column>_Unknown``.

     Args:
         movie_info: DataFrame containing the columns named above.

     Returns:
         A new DataFrame with one row per row of ``movie_info``, in the same
         order, built from plain lists (so it carries a default index).
     """
     row_count = movie_info.shape[0]
     encoded = {
         numeric_column: movie_info[numeric_column].tolist()
         for numeric_column in ["year", "average_rating"]
     }

     def flag(category, value, row_pos):
         # Set the one-hot cell for (category, value) on the given row.
         # "" is tested first so utils.is_ascii never sees an empty value.
         if value == "" or not utils.is_ascii(value):
             value = "Unknown"
         one_hot_colname = "{}_{}".format(category, value)
         if one_hot_colname not in encoded:
             encoded[one_hot_colname] = [0] * row_count
         encoded[one_hot_colname][row_pos] = 1

     # Rows are addressed by position, not by index label: the label is only
     # a valid list offset when the frame happens to carry a 0..n-1 index
     # (e.g. it breaks after filtering or sorting without reset_index).
     for row_pos, (_, movie) in enumerate(movie_info.iterrows()):
         for list_like_category in ["genres", "cast"]:
             for item in movie[list_like_category]:
                 flag(list_like_category, item, row_pos)
         for single_category in ["producer", "writer", "composer"]:
             flag(single_category, movie[single_category], row_pos)
     return pd.DataFrame(encoded)
Example #7
0
    def post_beacon(self):
        """API used to send captured beacons from LP to Controller.

        Parses the request body with ``json_to_beacon``, then either refreshes
        the ``Implant`` row whose uuid matches the beacon or inserts a new
        one. When the beacon carries a non-empty ``data`` entry, it is saved
        in a new ``DataStore`` row linked to that implant. Always answers
        ``'Success'`` with HTTP status 200.
        """
        beacon = json_to_beacon(request.data)
        beacon_uuid = beacon['uuid']

        implant = db.session.query(Implant).filter_by(uuid=beacon_uuid).first()
        if not implant:
            # First beacon seen for this uuid: register the implant.
            implant = Implant(uuid=beacon_uuid)
            db.session.add(implant)
        else:
            # Known implant: refresh its last-seen time and external address.
            implant.last_beacon_received = datetime.now()
            implant.external_ip_address = beacon['external_ip_address']
        db.session.commit()

        # Persist the payload only when one is present and non-empty.
        payload = beacon['data'] if 'data' in beacon else None
        if payload:
            datastore = DataStore(implant=[implant], timestamp=datetime.now())
            # ASCII payloads go to text_received, anything else to
            # data_received.
            target_field = ('text_received' if is_ascii(payload)
                            else 'data_received')
            setattr(datastore, target_field, payload)
            db.session.add(datastore)
            db.session.commit()

        return make_response('Success', 200)
Example #8
0
    def post_beacon(self):
        """API used to send captured beacons from LP to Controller.

        The request body is decoded by ``json_to_beacon``. An ``Implant`` with
        the beacon's uuid is updated if it exists and created otherwise; a
        non-empty ``data`` entry is then recorded in a ``DataStore`` row tied
        to that implant. The response is always ``'Success'`` with status 200.
        """
        beacon = json_to_beacon(request.data)

        # Look the implant up by uuid; update it or create it.
        existing = db.session.query(Implant).filter_by(
            uuid=beacon['uuid']).first()
        if existing:
            implant = existing
            implant.last_beacon_received = datetime.now()
            implant.external_ip_address = beacon['external_ip_address']
        else:
            implant = Implant(uuid=beacon['uuid'])
            db.session.add(implant)
        db.session.commit()

        # Store beacon data, if the beacon has any.
        if 'data' in beacon and beacon['data']:
            received = beacon['data']
            record = DataStore(implant=[implant], timestamp=datetime.now())
            if not is_ascii(received):
                record.data_received = received
            else:
                record.text_received = received
            db.session.add(record)
            db.session.commit()

        status = 200
        return make_response('Success', status)
Example #9
0
def test_is_ascii():
    """Check is_ascii on an ASCII string and on Korean text.

    The Korean sample is checked both as a plain literal and as a
    ``u``-prefixed literal.
    """
    assert is_ascii('abcd123')
    for korean in ('가나다', u'가나다'):
        assert not is_ascii(korean)
Example #10
0
def process_country_gender(params):
    """Derive per-name country and gender label files from a person dump.

    Steps, in order:
      1. Load the country table (header row skipped); each lower-cased
         country name gets an id equal to its 0-based row number, and each
         FIPS code is mapped to its country name.
      2. Load the city table (header row skipped, rows without exactly two
         fields ignored); a city name seen more than once is marked
         ambiguous and dropped later.
      3. Write ``country<TAB>id`` lines, sorted by country name.
      4. Build a character trie over the unambiguous city names; the node of
         a city's last character stores that city's country id.
      5. Stream the input records and, for ASCII names only, write
         ``name<TAB>country`` and ``name<TAB>gender`` lines whenever the
         corresponding confidence is strictly above the threshold.

    Args:
        params: mapping with the keys ``process_country_gender_countries``,
            ``process_country_gender_cities``,
            ``process_country_gender_output_country_id``,
            ``process_country_gender_input_file``,
            ``process_country_gender_output_file_gender``,
            ``process_country_gender_output_file_country`` and, optionally,
            ``process_country_gender_confidence`` (threshold; 0 when absent).
    """
    assert 'process_country_gender_countries' in params
    assert 'process_country_gender_cities' in params

    # 1. Country table.
    country_dict = {}
    country_id = {}
    with codecs.open(params['process_country_gender_countries'], encoding='utf-8', mode='r') as fin:
        for row_no, line in enumerate(fin.readlines()[1:]):
            fips, iso, tsd, country = line.strip().split('\t')
            country = country.lower()
            country_dict[country] = row_no
            country_id[fips] = country

    # 2. City table; duplicated city names are flagged as ambiguous.
    with codecs.open(params['process_country_gender_cities'], encoding='utf-8', mode='r') as fin:
        city_dict_tmp = {}
        for line in fin.readlines()[1:]:
            fields = line.strip().split('\t')
            if len(fields) != 2:
                continue
            fips, city = fields
            city = city.lower()
            if city in city_dict_tmp:
                city_dict_tmp[city] = 'NOTACOUNTRY'
            else:
                city_dict_tmp[city] = country_id[fips]

    # 3. Country id listing, sorted by name.
    with codecs.open(params['process_country_gender_output_country_id'], encoding='utf-8', mode='w') as fout:
        for country in sorted(country_dict.keys()):
            fout.write('%s\t%d\n' % (country, country_dict[country]))

    # 4. Trie of unambiguous cities. Each node is [country id or 'N', children].
    city_trie = {}
    for city, city_country in city_dict_tmp.items():
        if city_country == 'NOTACOUNTRY':
            continue
        node = city_trie
        last_pos = len(city) - 1
        for pos, char in enumerate(city):
            if char not in node:
                node[char] = ['N', {}]
            if pos == last_pos:
                node[char][0] = country_dict[city_country]
            node = node[char][1]
    city_dict = city_trie

    # Confidence must be strictly greater than the configured threshold, or
    # strictly greater than 0 when none is configured.
    if 'process_country_gender_confidence' in params:
        threshold = params['process_country_gender_confidence']
    else:
        threshold = 0

    # 5. Stream the records.
    with codecs.open(params['process_country_gender_input_file'], encoding='utf-8', mode='r') as fin:
        with open(params['process_country_gender_output_file_gender'], 'w') as fgender:
            with open(params['process_country_gender_output_file_country'], 'w') as fcountry:
                male_total = 0
                female_total = 0
                for line_idx, line in enumerate(fin, 1):
                    if line_idx % 10000 == 0:
                        print('%d lines processed' % line_idx)
                    (_record_id, name, place_of_birth, nationality,
                     male_cnt, female_cnt) = line.strip().split('\t')
                    if not utils.is_ascii(name):
                        continue

                    confidence, country = get_place_of_birth(place_of_birth, city_dict, country_dict)
                    if not confidence > threshold:
                        # Birthplace was inconclusive; fall back to nationality.
                        confidence, country = get_nationality(nationality, city_dict, country_dict)
                    if confidence > threshold:
                        fcountry.write('%s\t%d\n' % (name, country))

                    confidence, gender = get_gender(male_cnt, female_cnt)
                    if confidence > threshold:
                        fgender.write('%s\t%d\n' % (name, gender))
                        # Gender value 1 is tallied as male, anything else
                        # as female.
                        if gender == 1:
                            male_total += 1
                        else:
                            female_total += 1
                print('Male: %d, Female: %d\n' % (male_total, female_total))
 def normalize_categorical(string):
     """Return ``string`` if it is non-empty and ASCII, else ``"Unknown"``."""
     if not utils.is_ascii(string) or len(string) <= 0:
         return "Unknown"
     return string