Example No. 1
 def test_analyze_bounds(self):
     """
     Testing the bounds of the tweet values
     """
     ana = Analyzer()
     assert ana.analyze("this is a test neutral tweet") <= 1.0
     assert ana.analyze("this is a test neutral tweet") >= 0.0
Example No. 3
def testall(directory, pred_file=None, label_file=None, out_path=None):
    folders = os.listdir(directory)
    networks = []
    for folder in folders:
        if os.path.isfile(directory+folder+"/network.cfg") and os.path.exists(directory+folder+"/results"):
            networks.append(folder)
    
    config_file = directory+networks[0]+"/network.cfg"
    config = ConfigParser.ConfigParser()
    config.read(config_file)
    
    test_data = LoadData(directory = config.get('Testing Data', 'folders').split(','), 
                         data_file_name = config.get('Testing Data', 'data_file'),
                         label_file_name = config.get('Testing Data', 'label_file'),
                         seg_file_name = config.get('Testing Data', 'seg_file'))
    
    res = Analyzer(raw = test_data.get_data()[0], 
                   target = test_data.get_labels()[0])
                   
    for net in networks:
        config_file = directory+net+"/network.cfg"
        config = ConfigParser.ConfigParser()
        config.read(config_file)
        
        res.add_results(results_folder = config.get('General','directory'),
                        name = net,
                        prediction_file = config.get('Testing', 'prediction_file')+'_0', 
                        learning_curve_file = 'learning_curve')
                           
        res.analyze(-1, pred_file=pred_file, label_file=label_file, out_path=out_path)
        
    return res
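A hedged usage sketch (the directory name, file names, and output path below are hypothetical; the trailing slash matters because testall builds paths by plain string concatenation):

# Hypothetical layout: networks/<name>/network.cfg plus a networks/<name>/results folder per trained network
res = testall("networks/")
# Optionally forward prediction/label files and an output directory to each res.analyze() call
res = testall("networks/", pred_file="predictions.h5", label_file="labels.h5", out_path="analysis/")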
Example No. 4
 def test_analyze_judgement_weight(self):
     """
     Testing the value order
     of arbitrary tweets
     """
     ana = Analyzer()
     assert ana.analyze("i am so happy, great day :D") > ana.analyze("i am so happy :D")
     assert ana.analyze("so sad, feeling depressed :'(") < ana.analyze("so depressed :'(")
Example No. 5
 def test_analyze_empty(self):
     """
     Testing empty tweets
     and tweets including words not
     in the dictionary
     """
     ana = Analyzer()
     assert ana.analyze("") == 0.5
     assert ana.analyze("hzoehfsdl") == 0.5
Example No. 7
 def test_analyze_judgement(self):
     """
     Testing the proper judgement of the sentiment analysis:
     * positive and negative
     * best and worst tweet values
     """
     ana = Analyzer()
     assert ana.analyze(":)") > 0.5 and ana.analyze(":'(") < 0.5
     assert ana.analyze("yahoo yahoo yahoo") == 1.0
     assert ana.analyze("zzz zzz zzz zzz zzz") == 0.0
Example No. 8
 def test_categories_weight(self):
     """
     Testing the weights of the different
     category sums (positive, negative, neutral)
     """
     ana = Analyzer()
     ctg_total = {'positive': 0.0, 'negative': 0.0, 'neutral': 0.0}
     ctg_count = {'positive': 4, 'negative': 2, 'neutral': 1}
     data = [2, 3, 0, 2, 2, 0, -4, 0, 0, -2, 2]
     tot_pos, tot_neg, tot_neu = ana.weight_categories(data, ctg_total, ctg_count)
     assert (tot_pos, tot_neg, tot_neu) == (99.47646509317096, -49.392885301738836, 3.9750077625545726)
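Exact equality on floating-point totals like these is brittle across platforms; a minimal alternative sketch, assuming the suite is run with pytest (which the bare-assert style suggests), compares within a tolerance instead:

 def test_categories_weight_approx(self):
     """
     Same fixture as above, but tolerant of rounding error
     """
     import pytest
     ana = Analyzer()
     ctg_total = {'positive': 0.0, 'negative': 0.0, 'neutral': 0.0}
     ctg_count = {'positive': 4, 'negative': 2, 'neutral': 1}
     data = [2, 3, 0, 2, 2, 0, -4, 0, 0, -2, 2]
     tot_pos, tot_neg, tot_neu = ana.weight_categories(data, ctg_total, ctg_count)
     # pytest.approx applies a relative tolerance (1e-6 by default) instead of exact equality
     assert tot_pos == pytest.approx(99.476465)
     assert tot_neg == pytest.approx(-49.392885)
     assert tot_neu == pytest.approx(3.975008)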
Example No. 9
 def test_categories_cardinality(self):
     """
     Testing the cardinality of the different
     category sums (positive, negative, neutral)
     """
     ana = Analyzer()
     ctg_count = {'positive': 0, 'negative': 0, 'neutral': 0}
     text = 'great day today lol ;) but still have to work'
     assert ana.categories_cardinality(text, ctg_count) == 15
     assert ctg_count['positive'] == 4  # great day lol ;)
     assert ctg_count['neutral'] == 1  # today
     assert ctg_count['negative'] == 2  # work still
Example No. 11
 def test_categories_weight(self):
     """
     Testing the weights of the different
     category sums (positive, negative, neutral)
     """
     ana = Analyzer()
     ctg_total = {'positive': 0.0, 'negative': 0.0, 'neutral': 0.0}
     ctg_count = {'positive': 4, 'negative': 2, 'neutral': 1}
     data = [2, 3, 0, 2, 2, 0, -4, 0, 0, -2, 2]
     tot_pos, tot_neg, tot_neu = ana.weight_categories(
         data, ctg_total, ctg_count)
     assert (tot_pos, tot_neg,
             tot_neu) == (99.47646509317096, -49.392885301738836,
                          3.9750077625545726)
Example No. 12
def testprediction(config_file, pred_file=None, label_file=None, out_path=None):     
    config = ConfigParser.ConfigParser()
    config.read(config_file)
    
    test_data = LoadData(directory = config.get('Testing Data', 'folders').split(','), 
                         data_file_name = config.get('Testing Data', 'data_file'),
                         label_file_name = config.get('Testing Data', 'label_file'), 
                         seg_file_name = config.get('Testing Data', 'seg_file'))
    
    res = Analyzer(raw = test_data.get_data()[0],
                   target = test_data.get_labels()[0])
    res.add_results(results_folder = config.get('General','directory'),
                    name = config_file.split('/')[-3],
                    prediction_file = config.get('Testing', 'prediction_file')+'_0', 
                    learning_curve_file = 'learning_curve')           
    res.analyze(-1, pred_file=pred_file, label_file=label_file, out_path=out_path)
    
    return res
Example No. 13
def analyze(chart_ids: List[str] = [],
            src: str = CHART_PATH,
            dest: str = default_excel_path):
    """
        Analyzes charts given a list of IDs. If you want to analyze all levels
        in src, don't input any IDs.
    """
    if len(chart_ids) == 0:
        with os.scandir(src) as dir_items:
            chart_ids = [
                cid.name for cid in dir_items if is_chart_folder(cid.path)
            ]

    if len(chart_ids) == 0:
        click.echo("No charts in the folder!")
        return  # nothing to analyze

    stat_list = dict()
    src = os.path.abspath(src)
    dest = os.path.abspath(dest)
    os.makedirs(os.path.dirname(dest), exist_ok=True)

    with click.progressbar(chart_ids,
                           label=f"Analyzing {len(chart_ids)} charts...",
                           item_show_func=lambda x: x) as prog_bar:
        for chart_id in prog_bar:
            analyzer = Analyzer(src, chart_id)
            analyzer.start()
            stats = analyzer.get_stats_as_json()
            stat_list[chart_id] = stats

    click.echo(f"Done analyzing, now saving to {dest}...")
    dest_folder = os.path.dirname(dest)
    os.makedirs(dest_folder, exist_ok=True)

    stat_df = pd.DataFrame.from_dict(stat_list, orient="index")
    stat_df.index.name = "chart_id"

    excel_writer = ExcelWriter(stat_df, dest)
    excel_writer.format_table()
    excel_writer.close()

    click.echo("Stats successfully saved.")
Example No. 14
def home():
    if request.method == 'GET':
        # render homepage template
        return render_template('boot.html')
    else:
        # grab POST form data
        data = request.form

        # parse as JSON
        jsondata = json.dumps(data, separators=(',', ':'))
        if 'topic' in jsondata:
            # load data into dictionary
            new_data = json.loads(jsondata)

            # create random number for this graph
            new_data['rand'] = str(int(random.random() * 999999999))

            # connect to twitter
            auth = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
            auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
            api = tweepy.API(auth)

            # get the tweets
            tweets = stream.gather_tweets(
                api, auth, keyword=new_data['topic'][0],
                limit=int(new_data['limit'][0]))

            # Create analyzer
            analyzer = Analyzer(tweets, new_data['topic'][0])
            analyzer.save_sentiment_data(int(new_data['rand']))

            # render results page
            return redirect((url_for('log', data=json.dumps(new_data),
                                     mode='debug')))
        elif 'username' in jsondata:
            # load data into dictionary
            new_data = json.loads(jsondata)

            # create random number for this graph
            new_data['rand'] = str(int(random.random() * 999999999))

            # connect to twitter
            auth = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
            auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
            api = tweepy.API(auth)

            # get the tweets
            tweets = stream.gather_tweets(
                api, auth, username=new_data['username'][0], limit=50)

            # Create analyzer
            analyzer = Analyzer(tweets, new_data['username'][0])

            analyzer.save_sentiment_data(int(new_data['rand']))
            return redirect((url_for('log', data=json.dumps(new_data),
                                     mode='debug')))
Example No. 16
 def test_analyze_judgement_weight(self):
     """
     Testing the value order
     of arbitrary tweets
     """
     ana = Analyzer()
     assert ana.analyze("i am so happy, great day :D") > ana.analyze(
         "i am so happy :D")
     assert ana.analyze("so sad, feeling depressed :'(") < ana.analyze(
         "so depressed :'(")
Example No. 17
def ViewResults(**kwargs):
    directory = kwargs.get("directory", "")
    network = kwargs.get("network", None)
    prediction_file = kwargs.get("predictions_file", None)

    if network:
        # Assume that all networks are tested on the same set of data
        config = ConfigParser.ConfigParser()
        config.read("networks/" + network + "/network.cfg")
        data = LoadData(
            directory=config.get("Testing Data", "folders").split(",")[0],
            data_file_name=config.get("Testing Data", "data_file"),
            label_file_name=config.get("Testing Data", "label_file"),
        )

        if not prediction_file:
            prediction_file = "test_prediction_0"

        results = Analyzer(target=data.get_labels()[0], raw=data.get_data()[0])
        results.add_results(results_folder="networks/" + network + "/", name=network, prediction_file=prediction_file)

    else:
        folders = os.listdir(directory)
        networks = []
        for folder in folders:
            if os.path.isfile(directory + folder + "/network.cfg"):
                networks.append(folder)

        # Assume that all networks are tested on the same set of data
        config = ConfigParser.ConfigParser()
        config.read(directory + networks[0] + "/network.cfg")
        data = LoadData(
            directory=config.get("Testing Data", "folders").split(",")[0],
            data_file_name=config.get("Testing Data", "data_file"),
            label_file_name=config.get("Testing Data", "label_file"),
        )

        if not prediction_file:
            prediction_file = "test_prediction_0"

        results = Analyzer(target=data.get_labels()[0], raw=data.get_data()[0])
        for net in networks:
            results.add_results(results_folder=directory + net + "/", name=net, prediction_file=prediction_file)

    return results
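A hedged usage sketch (the network name is hypothetical; the keyword names come from the kwargs.get calls above):

# Compare every trained network found under networks/ on the shared test set
results = ViewResults(directory="networks/")
# Or inspect a single network; predictions_file falls back to "test_prediction_0" when omitted
results = ViewResults(network="my_net", predictions_file="test_prediction_0")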
Example No. 18
h = TweetLoader('', path='data/backup/', filename='hillary_2016-07-13.json')
t = TweetLoader('', path='data/backup/', filename='trump_2016-07-13.json')
h.load()
t.load()

# Join them together
full_tweets = pd.concat([h.tweets, t.tweets])

# Assign label (second array) for Hillary(0)/Trump(1) tweets
label_array = np.array([0] * len(h.tweets) + [1] * len(t.tweets))

# Run through part of the model to get the PCA results and loading factors
# This is not the full model, just a part of it for illustration purposes
max_words = 50
mod = Analyzer(full_tweets['text'],
               labels=label_array,
               max_words=max_words,
               load_pca=False)

# mod.load_words()
mod.get_words()
mod.create_dtm()
mod.run_pca()

loadings = mod.loadings
loadings.index = ['PC' + str(j + 1) for j in range(len(loadings))]

# loadings = loadings.iloc[0:30, :]  # Use only a subset of the data
loadings = loadings.transpose()  # Use rotation

words = loadings.columns.tolist()
pc_names = loadings.index.tolist()
Example No. 19
# t = TweetLoader('realDonaldTrump')
h = TweetLoader('', path='data/backup/', filename='hillary_2016-07-13.json')
t = TweetLoader('', path='data/backup/', filename='trump_2016-07-13.json')
h.load()
t.load()

# Join them together
full_tweets = pd.concat([h.tweets, t.tweets])

# Assign label (second array) for Hillary(0)/Trump(1) tweets
label_array = np.array([0]*len(h.tweets) + [1]*len(t.tweets))

# Run through part of the model to get the PCA results and loading factors
# This is not the full model, just a part of it for illustration purposes
max_words = 50
mod = Analyzer(full_tweets['text'], labels=label_array, max_words=max_words, load_pca=False)

# mod.load_words()
mod.get_words()
mod.create_dtm()
mod.run_pca()

loadings = mod.loadings
loadings.index = ['PC'+str(j+1) for j in range(len(loadings))]

# loadings = loadings.iloc[0:30, :]  # Use only a subset of the data
loadings = loadings.transpose()  # Use rotation

words = loadings.columns.tolist()
pc_names = loadings.index.tolist()
Example No. 20
import matplotlib.pyplot as plt

# Load tweets
s2 = TweetLoader(filename='coolstars.json',
                 track_location=False,
                 path='coolstars19/data/')
s2.load()

df = s2.tweets.copy()
df.index = pd.DatetimeIndex(df['created_at'])

# Using the Analyzer class
max_words = 100
mod = Analyzer(df['text'],
               None,
               max_words=max_words,
               load_pca=False,
               load_svm=False,
               more_stop_words=['rt', 'cs19', 'cs19_uppsala'])

mod.get_words()
mod.create_dtm()
mod.run_pca()

# Exploration
print_dtm(mod.dtm, df['text'], 42)

# Top terms in components
top_factors(mod.load_squared, 0)

# Plots
make_biplot(mod.pcscores, None, mod.loadings, 0, 1)
Example No. 21
    print("Unknown model " + args.model + ".\n")
    exit()
# Send model weights to the device
model.to(args.device)
print(model)

#%%
"""
###################
Initialize model and analyzer save
###################
"""
# Apply weight initialization
model.apply(initializer)
# Create an analyzer object
analyzer = Analyzer(args)

#%%
"""
###################
Create optimizer
###################
"""
# Optimizer and Loss
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                       factor=0.5,
                                                       threshold=1e-6)
# Reconstruction loss (L1) for autoencoder-style models
if args.model in ['ae', 'vae', 'wae', 'vae_flow']:
    criterion = nn.L1Loss()
Example No. 22
def main(argv=None):
    #read in params
    if argv is None:
        argv = sys.argv[1:]
    
    file = 'tulalens_survey_sample.csv'
    facet = 'result id'
    
    #standard python parsing for command line options
    opts = []
    args = []

    try:
        opts, args = getopt.getopt(argv, "hl", ["help", "list", "file=", "facet="])
    except getopt.GetoptError as msg:
        print(msg, file=sys.stderr)
        print("For help use --help", file=sys.stderr)
        return 2
    
    if len(args):
        print >>sys.stderr, "Invalid arg(s) %s"%args
        usage()
        return 2

    for (opt, val) in opts:
        if opt in ("-h", "--help"):
            usage()
            return 0
        if opt in ("-l", "--list"):
            list()
            return 0
        elif opt in ("--file"):
            file = val
        elif opt in ("--facet"):
            facet = val.lower()
        else:
            usage()
            return 2
    
    print("facet: %s" % facet)
    #check if facet given is in the list of survey questions
    #ideally this allows for quick entries with just the 
    #question number, e.g. "--facet Q30"
    long_q = '' #keep track of the long form for later use
    valid_facet = False
    for long, short in SHORT_QUESTIONS.items():
        #print("checking question: %s" % question)
        if facet in long:
            #turn the facet into easy to use question ids
            #p = "(^q\d\d?[.]).*"
            #m = re.match(p, long)
            facet = short
            long_q = long
            print("Question selected: %s" % long_q)
            valid_facet = True
            break
    
    if not valid_facet:
        sys.exit("facet selected is not a survey question")
            
    #parse csv file
    parser = CsvParse(file)
    answers = parser.parse()
    
    #generate analysis based on options
    #print("number of answer rows after parse: %s" % len(answers))
    
    analyze = Analyzer(answers)
    #find the unique occurrence of each answer to the question
    answers_count = analyze.group_by(facet)
    
    mean = analyze.find_mean(facet, answers_count)
        
    sys.exit()
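A hedged invocation sketch (the script name is hypothetical; the flags come from the getopt spec above, and the facet matching accepts a bare question id such as q30):

python survey_analysis.py --list
python survey_analysis.py --file tulalens_survey_sample.csv --facet "q30"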
Example No. 23
from analysis import Analyzer
from helper_functions import print_dtm, top_factors, make_biplot
import pandas as pd
import matplotlib.pyplot as plt


# Load tweets
s2 = TweetLoader(filename='coolstars.json', track_location=False, path='coolstars19/data/')
s2.load()

df = s2.tweets.copy()
df.index = pd.DatetimeIndex(df['created_at'])

# Using the Analyzer class
max_words = 100
mod = Analyzer(df['text'], None, max_words=max_words, load_pca=False, load_svm=False,
               more_stop_words=['rt', 'cs19', 'cs19_uppsala'])

mod.get_words()
mod.create_dtm()
mod.run_pca()

# Exploration
print_dtm(mod.dtm, df['text'], 42)

# Top terms in components
top_factors(mod.load_squared, 0)

# Plots
make_biplot(mod.pcscores, None, mod.loadings, 0, 1)

Example No. 24
    404 error handler
    used when a non-existent route
    is requested
    """
    return render_template('404.html'), 404


@app.errorhandler(500)
def page_not_found(exc):
    """
    500 error handler
    used if there is a server error
    """
    return render_template('500.html'), 500


if __name__ == '__main__':
    analyzer = Analyzer()
    server = SocketIOServer(('', PORT), app, resource="socket.io")
    tw_thread = TweetWeather(server, analyzer, name="Tweet-Weather-Thread")
    tw_thread.daemon = True
    gevent.spawn(tw_thread.new_post, server)
    gevent.spawn(tw_thread.connexion_lost, server)
    print "Application Started: http://localhost:5000"
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        tw_thread.stop()
        server.stop()
        sys.exit()
Example No. 25
# Merge tweets together, pass to Analyzer
df_tweets = pd.concat([h.tweets['text'], t.tweets['text']],
                      axis=0,
                      join='outer',
                      join_axes=None,
                      ignore_index=True,
                      keys=None,
                      levels=None,
                      names=None,
                      verify_integrity=False)

# Using the Analyzer class
mod = Analyzer(df_tweets,
               label_array,
               max_words=max_words,
               load_pca=False,
               load_svm=False,
               use_sentiment=True)

# mod.get_words()
# mod.create_dtm()
# mod.run_pca()
# mod.get_sentiment()
# test_predict, test_label = mod.run_svm()

# One-line alternative with defaults
test_predict, test_label = mod.create_full_model()

# Check a PCA plot
# mod.make_biplot(2, 3, max_arrow=0.2)
Example No. 26
    def __init__(self, market):
        super(MarketThread, self).__init__()
        self.market = market

    def run(self):
        while not self._stop.isSet():
            time.sleep(settings.HEARTBEAT)
            self.market.update()


if __name__ == "__main__":
    q = Queue.Queue()

    p = Portfolio(20000)
    e = Executor(p)
    a = Analyzer(portfolio=p)
    m = Market(queue=q)

    trading_thread = TradingThread(queue=q, analyzer=a, events=e)
    market_thread = MarketThread(market=m)

    def receive_signal(signum, stack):
        print("You quit")
        trading_thread.stop()
        market_thread.stop()
        sys.exit(0)

    market_thread.start()
    trading_thread.start()
    signal.signal(signal.SIGINT, receive_signal)
Example No. 27
                print(tweet.text.encode('utf-8'))
                tweets.append(tweet._json)
                count += 1
                if count > limit:
                    break
        return tweets
    elif keyword:
        l = StdOutListener()
        stream = Stream(auth, l)
        global lim
        lim = limit
        stream.filter(track=[keyword])
        with open('tweet_stream.pickle', 'rb') as f:
            return pickle.load(f)
    else:
        raise ValueError("Invalid arguments: username and keyword "
                         "cannot both be None")

if __name__ == '__main__':
    s = 'baltimore'
    auth = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
    api = tweepy.API(auth)
    # tweets = gather_tweets(username=s) # last 30 tweets
    tweets = gather_tweets(keyword=s, limit=30)
    # Create analyzer
    analyzer = Analyzer(tweets, s)
    avg = analyzer.calc_sentiment()
    #keywrds = analyzer.get_keywords()
    analyzer.save_sentiment_data()
Example No. 28
    # timer
    STOP = time.time()

    print(f"\t-----> Done.")
    print(f"\t-----> Execution time: {round(STOP-START, 2)} sec")


if __name__ == "__main__":
    app_settings = {
        'client_id': os.getenv('SPOTIFY_CLIENT_ID'),
        'client_secret': os.getenv('SPOTIFY_CLIENT_SECRET'),
        'redirect_uri': os.getenv('SPOTIFY_REDIRECT_URI')
    }

    # init analyzer
    az = Analyzer(**app_settings)

    # get tracks and simulate lengths
    # get all playlists
    playlists = az.user_playlists(is_author=True)

    start = time.time()
    print("-----> Gathering all tracks...", end="")
    # get all tracks
    all_tracks = []
    for playlist in playlists:
        tracks = az.playlist_tracks(playlist['id'])
        # append the playlist meta data
        # to the track objects
        for i in range(len(tracks)):
            tracks[i]['playlist'] = playlist
Example No. 29
# h = TweetLoader('HillaryClinton')
# t = TweetLoader('realDonaldTrump')
h = TweetLoader('', path='data/backup/', filename='hillary_2016-07-13.json')
t = TweetLoader('', path='data/backup/', filename='trump_2016-07-13.json')
h.load()
t.load()

# Assign label (second array) for Hillary(0)/Trump(1) tweets
label_array = np.array([0]*len(h.tweets) + [1]*len(t.tweets))

# Merge tweets together, pass to Analyzer
df_tweets = pd.concat([h.tweets['text'], t.tweets['text']], axis=0, join='outer', join_axes=None,
                      ignore_index=True, keys=None, levels=None, names=None, verify_integrity=False)

# Using the Analyzer class
mod = Analyzer(df_tweets, label_array, max_words=max_words, load_pca=False, load_svm=False, use_sentiment=True)

# mod.get_words()
# mod.create_dtm()
# mod.run_pca()
# mod.get_sentiment()
# test_predict, test_label = mod.run_svm()

# One-line alternative with defaults
test_predict, test_label = mod.create_full_model()

# Check a PCA plot
# mod.make_biplot(2, 3, max_arrow=0.2)

# Check results
cm = mod.make_confusion_matrix(test_label, test_predict, normalize=False, axis=0, label_names=['Clinton', 'Trump'])
Example No. 30
    "pem_name",
    help=
    "Name of the PEM file that is needed to connect to the data collection servers."
)
parser.add_argument(
    "database_ip",
    help="IP of the Postgres database that the results will be put into.")
parser.add_argument("data_collector_ips",
                    nargs='+',
                    help="List of IPs of the data collection servers.")

args = parser.parse_args()

ec = External_Connector(args.pem_name, args.database_ip)

# Create list of local files, first is twitter data, rest is news data
files = [
    "%s%d.txt" % (args.type, i) for i in range(0, len(args.data_collector_ips))
]

ec.get_data_files(args.data_collector_ips, files)

a = Analyzer()

# Run three analyses for each data file and upload them to database
for f in files:
    sentiment, mood, emoticon = a.run(args.type, f)
    ec.insert_sentiment(args.run_id, args.type, sentiment)
    ec.insert_mood(args.run_id, args.type, mood)
    ec.insert_emoticon(args.run_id, args.type, emoticon)
Example No. 31
import pandas as pd
import numpy as np

# Some global defaults
max_words = 200

# Load most recent tweets from Hillary Clinton and Donald Trump
# s = TweetLoader(filename='search.json', track_location=True)
s = TweetLoader(filename='search_2016-07-13.json',
                track_location=True,
                path='data/backup/')
s.load()

# Calculate and grab model results
mod = Analyzer(s.tweets['text'],
               max_words=max_words,
               load_pca=True,
               load_svm=True)
predict = mod.load_full_model()  # Hillary=0  Trump=1
s.tweets['predict'] = predict

# Clean up missing coordinates
df = s.tweets['geo.coordinates']
bad = df.apply(lambda x: x is None)
df = df[~bad]
s.tweets = s.tweets[~bad]

lat = df.apply(lambda x: x[0])
lon = df.apply(lambda x: x[1])
# lat, lon = zip(*df)  # Alternate

# Remove Alaska and Hawaii
Example No. 32
def main(argv=None):
    #read in params
    if argv is None:
        argv = sys.argv[1:]

    file = 'tulalens_survey_sample.csv'
    facet = 'result id'

    #standard python parsing for command line options
    opts = []
    args = []

    try:
        opts, args = getopt.getopt(argv, "hl",
                                   ["help", "list", "file=", "facet="])
    except getopt.GetoptError as msg:
        print(msg, file=sys.stderr)
        print("For help use --help", file=sys.stderr)
        return 2

    if len(args):
        print >> sys.stderr, "Invalid arg(s) %s" % args
        usage()
        return 2

    for (opt, val) in opts:
        if opt in ("-h", "--help"):
            usage()
            return 0
        if opt in ("-l", "--list"):
            list()
            return 0
        elif opt in ("--file"):
            file = val
        elif opt in ("--facet"):
            facet = val.lower()
        else:
            usage()
            return 2

    print("facet: %s" % facet)
    #check if facet given is in the list of survey questions
    #ideally this allows for quick entries with just the
    #question number, e.g. "--facet Q30"
    long_q = ''  #keep track of the long form for later use
    valid_facet = False
    for long, short in SHORT_QUESTIONS.items():
        #print("checking question: %s" % question)
        if facet in long:
            #turn the facet into easy to use question ids
            #p = "(^q\d\d?[.]).*"
            #m = re.match(p, long)
            facet = short
            long_q = long
            print("Question selected: %s" % long_q)
            valid_facet = True
            break

    if not valid_facet:
        sys.exit("facet selected is not a survey question")

    #parse csv file
    parser = CsvParse(file)
    answers = parser.parse()

    #generate analysis based on options
    #print("number of answer rows after parse: %s" % len(answers))

    analyze = Analyzer(answers)
    #find the unique occurrence of each answer to the question
    answers_count = analyze.group_by(facet)

    mean = analyze.find_mean(facet, answers_count)

    sys.exit()
Example No. 33
from bokeh.sampledata.us_states import data as states
from bokeh.models import ColumnDataSource, HoverTool
import reverse_geocoder as rg
import pandas as pd
import numpy as np

# Some global defaults
max_words = 200

# Load most recent tweets from Hillary Clinton and Donald Trump
# s = TweetLoader(filename='search.json', track_location=True)
s = TweetLoader(filename='search_2016-07-13.json', track_location=True, path='data/backup/')
s.load()

# Calculate and grab model results
mod = Analyzer(s.tweets['text'], max_words=max_words, load_pca=True, load_svm=True)
predict = mod.load_full_model()  # Hillary=0  Trump=1
s.tweets['predict'] = predict

# Clean up missing coordinates
df = s.tweets['geo.coordinates']
bad = df.apply(lambda x: x is None)
df = df[~bad]
s.tweets = s.tweets[~bad]

lat = df.apply(lambda x: x[0])
lon = df.apply(lambda x: x[1])
# lat, lon = zip(*df)  # Alternate

# Remove Alaska and Hawaii
del states["HI"]
Example No. 34
__author__ = 'Ahmed Hani Ibrahim'
from reader import DataReader
from analysis import Analyzer

file_path = './data/data_science_dataset_wuzzuf.csv'

reader = DataReader(file_path)
data = reader.read_data()
analyzer = Analyzer(data)

analyzer.trending_category()

x = 0
Example No. 35
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.insert(1, "../tools")

from analysis import Analyzer
from plotting import Plotter
from training import Trainer

if __name__ == "__main__":
    sns.set()
    plot_dir = "plots"
    plot_file = os.path.join(plot_dir, "rdf.png")
    if not os.path.exists(plot_dir):
        os.mkdir(plot_dir)

    anl = Analyzer()
    plter = Plotter()
    r_cut = 6.0
    r, rdf = anl.calculate_rdf("trajs/training.traj", r_max=r_cut)
    rdf[np.nonzero(rdf)] /= max(rdf)
    cutoff = plter.polynomial(r, r_cut, gamma=5.0)

    plt.plot(r, rdf, label="Radial distribution function")
    plt.plot(r, cutoff, label="Polynomial cutoff, gamma=5.0")
    plt.legend()
    plt.title("Copper radial distribution function")
    plt.xlabel("Radial distance [Angstrom]")
    plt.ylabel("Radial distribution function (normalized to 1)")
    plt.savefig(plot_file)
Example No. 36
    trn.create_Gs(elements, num_radial_etas, num_angular_etas, num_zetas,
                  angular_type)
    symm_funcs["Selected"] = trn.Gs

    G2 = make_symmetry_functions(elements=elements,
                                 type="G2",
                                 etas=[0.05, 0.23, 1.0, 5.0],
                                 centers=np.zeros(4))
    G4 = make_symmetry_functions(
        elements=elements,
        type="G4",
        etas=0.005 * np.ones(1),
        zetas=[1.0, 4.0],
        gammas=[1.0, -1.0],
    )
    symm_funcs["Default"] = G2 + G4

    anl = Analyzer()
    plter = Plotter()
    r, rdf = anl.calculate_rdf(train_traj, r_max=cutoff.Rc)

    for label, symm_func in symm_funcs.items():
        plter.plot_symmetry_functions(
            label + "_rad.png",
            label + "_ang.png",
            symm_func,
            rij=r,
            rdf=rdf,
            cutoff=cutoff,
        )
Example No. 37
    def analyze(self, expr):
        name = "f"

        self.analyzer = Analyzer(expr)
        self.function_view.set_from_expression(expr, name=name + "(x)")
        self.function_view.set_font_size(40)

        box = ListBox("Dominio")
        self.box.pack_start(box, False, False, 0)

        domain_block = EqualBlock(TextBlock("D(f)"), TextBlock(interval_to_string(self.analyzer.domain)))
        box.make_row_with_child(domain_block)

        box = ListBox("Raíces")
        self.box.pack_start(box, False, False, 0)

        roots_block = TextBlock(set_to_string(self.analyzer.roots.keys()))
        box.make_row_with_child(roots_block)

        box = ListBox("Signo")
        self.box.pack_start(box, False, False, 0)

        if self.analyzer.positive.__class__ != sympy.EmptySet:
            positive_block = TextBlock("+  " + interval_to_string(self.analyzer.positive))
            box.make_row_with_child(positive_block)

        if self.analyzer.negative.__class__ != sympy.EmptySet:
            negative_block = TextBlock("-  " + interval_to_string(self.analyzer.negative))
            box.make_row_with_child(negative_block)

        box = ListBox("Continuidad")
        self.box.pack_start(box, False, False, 0)

        if self.analyzer.continuity == self.analyzer.domain:
            block = TextBlock("f es continua en todo su dominio.")

        else:
            block = TextBlock("f es continua para los x %s %s\n" % (Chars.BELONGS, interval_to_string(self.analyzer.continuity)))

        box.make_row_with_child(block)

        box = ListBox("Ramas")
        self.box.pack_start(box, False, False, 0)

        if self.analyzer.branches[sympy.oo] is not None:
            block = TextBlock("f posee %s cuando" % Branch.get_name(*self.analyzer.branches[sympy.oo]))
            row = box.make_row_with_child(block)
            trend_block = TrendBlock(TextBlock("x"), TextBlock("+" + Chars.INFINITY))
            trend_block.set_margin_left(10)
            row.add_child(trend_block)

        if self.analyzer.branches[-sympy.oo] is not None:
            block = TextBlock("f posee %s cuando" % Branch.get_name(*self.analyzer.branches[-sympy.oo]))
            row = box.make_row_with_child(block)
            trend_block = TrendBlock(TextBlock("x"), TextBlock("-" + Chars.INFINITY))
            trend_block.set_margin_left(10)
            row.add_child(trend_block)

        box = ListBox("Crecimiento")
        self.box.pack_start(box, False, False, 0)

        block = MathView.new_from_expression(self.analyzer.derived, name + "'(x)")
        box.make_row_with_child(block)

        if self.analyzer.derived_things.negative.__class__ != sympy.EmptySet:
            block = TextBlock(name + " decrece en ")
            row = box.make_row_with_child(block)
            row.add_child(make_interval_points(self.analyzer.derived_things.negative))

        if self.analyzer.derived_things.positive.__class__ != sympy.EmptySet:
            block = TextBlock(name + " crece en ")
            row = box.make_row_with_child(block)
            row.add_child(make_interval_points(self.analyzer.derived_things.positive))

        mins, maxs = self.analyzer.get_minimums_and_maximums()

        if mins:
            block = TextBlock("Mínimos: ")
            row = box.make_row_with_child(block)

            for point in mins:
                _x = MathView.new_from_expression(point[0])
                _y = MathView.new_from_expression(point[1])
                block = PointBlock(_x, _y)
                row.add_child(block)

        if maxs:
            block = TextBlock("Máximos: ")
            row = box.make_row_with_child(block)

            for point in maxs:
                _x = MathView.new_from_expression(point[0])
                _y = MathView.new_from_expression(point[1])
                block = PointBlock(_x, _y)
                row.add_child(block)

        box = ListBox("Concavidad")
        self.box.pack_start(box, False, False, 0)

        block = MathView.new_from_expression(self.analyzer.derived2, name + "''(x)")
        box.make_row_with_child(block)

        if self.analyzer.derived2_things.positive.__class__ != sympy.EmptySet:
            block = TextBlock("f tiene concavidad positiva en: ")
            row = box.make_row_with_child(block)
            row.add_child(make_interval_points(self.analyzer.derived2_things.positive))

        if self.analyzer.derived2_things.negative.__class__ != sympy.EmptySet:
            block = TextBlock("f tiene concavidad negativa en: ")
            row = box.make_row_with_child(block)
            row.add_child(make_interval_points(self.analyzer.derived2_things.negative))

        _analyzer = Analyzer(self.analyzer.derived)
        mins, maxs = _analyzer.get_minimums_and_maximums()
        inflection_points = mins + maxs

        if inflection_points:
            block = TextBlock("Puntos de inflexión: ")
            row = box.make_row_with_child(block)

            for point in inflection_points:
                _x = MathView.new_from_expression(point[0])
                _y = MathView.new_from_expression(point[1])
                block = PointBlock(_x, _y)
                block.set_margin_right(10)
                row.add_child(block)

        self.show_all()
Example No. 38
    test_traj = "test.traj"
    steps, test_traj = trjbd.integrate_atoms(
        test_atoms, test_traj, n_test, save_interval, timestep=timestep, convert=True
    )

    amp_test_traj = "amp_test.traj"
    steps, amp_test_traj = trjbd.integrate_atoms(
        amp_test_atoms,
        amp_test_traj,
        n_test,
        save_interval,
        timestep=timestep,
        convert=True,
    )

    anl = Analyzer()
    r, rdf = anl.calculate_rdf(test_traj, r_max=6.0)
    r_amp, rdf_amp = anl.calculate_rdf(amp_test_traj, r_max=6.0)
    rdf_plot = system + "_" + "rdf.png"
    plter.plot_rdf(rdf_plot, legend, r, rdf, rdf_amp)

    steps, energy_exact, energy_amp = anl.calculate_pot_energy_diff(
        test_traj, amp_test_traj, save_interval=save_interval
    )
    pot_plot = system + "_" + "pot.png"
    plter.plot_pot_energy_diff(pot_plot, legend, steps, energy_exact, energy_amp)

    steps, energy_exact, energy_amp = anl.calculate_energy_diff(
        test_traj, amp_test_traj, save_interval=save_interval
    )
    energy_plot = system + "_" + "energy.png"
Example No. 39
# h = TweetLoader('HillaryClinton')
# t = TweetLoader('realDonaldTrump')
h = TweetLoader('', path='data/backup/', filename='hillary_2016-07-13.json')
t = TweetLoader('', path='data/backup/', filename='trump_2016-07-13.json')
h.load()
t.load()


# Assign label (second array) for Hillary(0)/Trump(1) tweets
label_array = np.array([0]*len(h.tweets) + [1]*len(t.tweets))

df_tweets = pd.concat([h.tweets['text'], t.tweets['text']], axis=0, join='outer', join_axes=None,
                      ignore_index=True, keys=None, levels=None, names=None, verify_integrity=False)

# Using the Analyzer class to get sentiments
mod = Analyzer(df_tweets, label_array)
mod.get_sentiment()

# Group together tweets, labels, and sentiments
temp = pd.concat([h.tweets, t.tweets], axis=0, join='outer', join_axes=None, ignore_index=True, levels=None)
df = pd.concat([temp, mod.sentiment, pd.DataFrame({'label': label_array})], axis=1, levels=None)


# Get Tweet text and URLs for embedding: https://twitter.com/{user}/status/{id}
def print_and_get_url(tweet):
    print(tweet['text'].values[0])
    print('https://twitter.com/{}/status/{}'.format(tweet['user.screen_name'].values[0], tweet['id'].values[0]))

# Most positive and negative tweet
print_and_get_url(df.sort_values(by='positive', ascending=False)[df['label'] == 0])
print_and_get_url(df.sort_values(by='positive', ascending=False)[df['label'] == 1])
Example No. 40
# Assign label (second array) for Hillary(0)/Trump(1) tweets
label_array = np.array([0] * len(h.tweets) + [1] * len(t.tweets))

df_tweets = pd.concat([h.tweets['text'], t.tweets['text']],
                      axis=0,
                      join='outer',
                      join_axes=None,
                      ignore_index=True,
                      keys=None,
                      levels=None,
                      names=None,
                      verify_integrity=False)

# Using the Analyzer class to get sentiments
mod = Analyzer(df_tweets, label_array)
mod.get_sentiment()

# Group together tweets, labels, and sentiments
temp = pd.concat([h.tweets, t.tweets],
                 axis=0,
                 join='outer',
                 join_axes=None,
                 ignore_index=True,
                 levels=None)
df = pd.concat([temp, mod.sentiment,
                pd.DataFrame({'label': label_array})],
               axis=1,
               levels=None)