Beispiel #1
0
    def setup_method(self):
        """Build the live-stream fixture shared by the tests of this class.

        Creates a ``MyStreamListener`` with ``tweetCount=10``, attaches it to
        an authenticated ``Stream`` filtered on every keyword (English only)
        and stores the first two values returned by ``listener.get_data()``
        on ``self.df`` and ``self.userData``.
        """
        # Coin name -> search terms tracked for that coin.
        self.keywords = {
            'bitcoin': ['bitcoin', 'BTC'],
            'dashcoin': ['dashcoin', 'DASH', 'darkcoin'],
            'dogecoin': ['dogecoin', 'DOGE'],
            'ethereum': ['ethereum', 'ETH'],
            'litecoin': ['litecoin', 'LTC'],
            'ripple': ['ripple', 'XRP'],
            'monero': ['monero', 'XMR'],
            'stellar': ['stellar', 'STR']
        }
        # Every search term of every coin, flattened into one list.
        self.keywordsOnly = [
            term for terms in self.keywords.values() for term in terms
        ]

        _, self.currRoot_dir = get_locations()
        log_file = self.currRoot_dir + '/livescraper/tests/live.log'
        self.logger = get_logger(log_file)
        self.listener = MyStreamListener(
            self.keywords, self.logger, tweetCount=10)

        # Credentials are supplied by the project's get_twitter() helper.
        api_key, api_secret, token, token_secret = get_twitter()
        auth = OAuthHandler(api_key, api_secret)
        auth.set_access_token(token, token_secret)

        self.myStream = Stream(auth=auth, listener=self.listener)
        self.myStream.filter(track=self.keywordsOnly, languages=['en'])

        # The third value returned by get_data() is not needed by the tests.
        self.df, self.userData, _ = self.listener.get_data()
Beispiel #2
0
    def __init__(self, keywords, historicList, proxies, relative_dir="/"):
        '''
        Store the run configuration and derive the working paths and logger.

        Parameters:
        ___________

        keywords: (mapping)
        Only its keys are used here; they are stored as self.coins

        historicList:
        Stored unchanged on the instance

        proxies:
        Stored unchanged on the instance

        relative_dir: (string)
        Joined onto the root returned by get_locations() to form self.currDir

        NOTE(review): os.path.join drops all earlier components when a later
        one is absolute, so the default "/" appears to make self.currDir "/"
        instead of a folder below the project root - confirm this is intended.
        '''
        _, project_root = get_locations()
        working_dir = os.path.join(project_root, relative_dir)

        self.relative_dir = relative_dir
        self.currDir = working_dir
        # The "{}" stays in the path as a placeholder (presumably filled with
        # a coin name by whoever formats it - verify against callers).
        self.historic_path = os.path.join(working_dir,
                                          "data/tweet/{}/historic_scrape")

        log_file = working_dir + '/logs/live.txt'
        self.logger = get_logger(log_file)

        self.coins = [coin for coin, _ in keywords.items()]
        self.historicList = historicList
        self.proxies = proxies
 def setup_method(self):
     """Create the historicProcessor under test from a single list entry.

     Only the first element of the list returned by get_keywords() is
     handed to the processor; the project root is kept on self.curr_Root.
     """
     _, all_entries = get_keywords()
     first_entry_only = [all_entries[0]]
     self.hp = historicProcessor(
         first_entry_only,
         "initial_algo",
         relative_dir="twitterscraper/tests",
     )
     _, self.curr_Root = get_locations()
Beispiel #4
0
    def __init__(self, proxy=None, logger=None):
        '''
        Set up the header list, the search URL templates, the logger and proxy.

        Parameters:
        ___________
        proxy: (dict)
        Single Proxy to use or None. Dictionary containing http, https and ftp proxy to use for using with requests

        logger: (logger)
        logger object to log all this. When None, a logger writing to
        <root>/logs/twitterscraper.log is created
        '''

        # Browser User-Agent strings kept on the instance.
        self.HEADERS_LIST = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
            'Mozilla/5.0 (X11; Linux x86_64; rv:58.0) Gecko/20100101 Firefox/58.0'
        ]

        # URL templates; {q}, {lang} and {pos} are filled in with str.format
        # style placeholders by the code that uses them.
        self.INIT_URL = "https://twitter.com/search?vertical=tweets&vertical=default&q={q}&l={lang}"
        self.RELOAD_URL = "https://twitter.com/i/search/timeline?vertical=" \
                    "default&include_available_features=1&include_entities=1&" \
                    "reset_error_state=false&src=typd&max_position={pos}&q={q}&l={lang}"
        _, self.currRoot = get_locations()

        # Identity test: None is a singleton, and "== None" would defer to any
        # custom __eq__ defined by the supplied logger object.
        if logger is None:
            logger = get_logger(self.currRoot + "/logs/twitterscraper.log")
        self.logger = logger

        self.proxy = proxy
Beispiel #5
0
    def __init__(self,
                 detailsList,
                 proxies=None,
                 logger=None,
                 relative_dir=""):
        '''
        Store the query details, proxies and relative directory; set up logging.

        Parameters:
        ___________
        detailsList (list): 
        List containing keyword, coinname in string and start and end in date format
        
        proxies (list of dict or None):
        list of dict in proxies format (containing http, https and ftp) to use for each next query. Else None to not use

        logger (logger):
        Logger to use. When None, a logger writing to
        <root>/logs/twitterscraper.log is created

        relative_dir (string):
        Stored unchanged on the instance
        '''

        _, self.currRoot_dir = get_locations()

        # Identity test: None is a singleton, and "== None" would defer to any
        # custom __eq__ defined by the supplied logger object.
        if logger is None:
            # Root and file are passed as separate components so that
            # os.path.join inserts the separator; concatenating them first
            # produced "<root>logs/twitterscraper.log".
            logger = get_logger(
                os.path.join(self.currRoot_dir, "logs/twitterscraper.log"))
        self.logger = logger

        self.detailsList = detailsList
        self.relative_dir = relative_dir
        self.proxies = proxies
Beispiel #6
0
    def __init__(self, keywords, logger=None, tweetCount=0):
        '''
        Prepare empty tweet and user tables, the API handle and the counters.

        Parameters:
        ___________
        keywords: (dict)
        Dictionary containing coinname and its relevant keywords
        Example:
        {'bitcoin': ['bitcoin', 'BTC'], 'dashcoin': ['dashcoin', 'DASH', 'darkcoin']}

        logger (logger) (optional):
        Logger to use. When None, a logger writing to <root>/logs/live.txt
        is created

        tweetCount (int) (optional):
        If not set to 0, the program will terminate after n tweets is found
        '''

        _, self.currRoot_dir = get_locations()

        # Identity test: None is a singleton, and "== None" would defer to any
        # custom __eq__ defined by the supplied logger object.
        if logger is None:
            logger = get_logger(self.currRoot_dir + '/logs/live.txt')
        self.logger = logger

        self.api = API()

        # Empty table with the per-tweet columns.
        self.df = pd.DataFrame(columns=[
            'ID', 'Tweet', 'Time', 'User', 'Likes', 'Replies', 'Retweets',
            'in_response_to', 'response_type', 'coinname'
        ])
        # Empty table with the per-user columns.
        self.userData = pd.DataFrame(columns=[
            'username', 'created', 'location', 'has_location', 'is_verified',
            'total_tweets', 'total_following', 'total_followers',
            'total_likes', 'has_avatar', 'has_background', 'is_protected',
            'profile_modified'
        ])

        self.keywords = keywords
        # Creation time as whole seconds since the epoch.
        self.start_time = int(time.time())
        self.tweetCount = tweetCount
        self.statusCount = 0
Beispiel #7
0
    def __init__(self, keywords, logger=None, tweetCount=0):
        """
        Store the keyword configuration, set up logging and create directories.

        Parameters:
        ___________
        keywords (dictionary):
        Dictionary containing coinname and its relevant keywords
        Example:
        {'bitcoin': ['bitcoin', 'BTC'], 'dashcoin': ['dashcoin', 'DASH', 'darkcoin']}

        logger (logger) (optional):
        Logger to use. When None, a logger writing to <root>/logs/live.log
        is created

        tweetCount (int) (optional):
        If not set to 0, the program will terminate after n tweets is found
        """
        # TODO: Relative directory needs to be moved here not in function below. And replace all with os.path.join
        _, self.currRoot_dir = get_locations()
        self.tweetCount = tweetCount

        self.keywords = keywords
        # Coin names are the dictionary keys ...
        self.coins = list(keywords)
        # ... and the search terms are all of its values, flattened.
        self.keywordsOnly = [
            term for terms in keywords.values() for term in terms
        ]

        # Identity test: None is a singleton, and "== None" would defer to any
        # custom __eq__ defined by the supplied logger object.
        if logger is None:
            logger = get_logger(self.currRoot_dir + '/logs/live.log')
        self.logger = logger

        runUtils(self.keywords).create_directory_structure()
Beispiel #8
0
    def __init__(self, proxy=None, logger=None):
        '''
        Set up the header list, the proxy and the logger.

        Parameters:
        ___________
        proxy:
        Stored unchanged on the instance (None by default)

        logger: (logger)
        Logger to use. When None, a logger writing to
        <root>/logs/profilescraper.log is created
        '''
        # Browser User-Agent strings kept on the instance.
        self.HEADERS_LIST = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
            'Mozilla/5.0 (X11; Linux x86_64; rv:58.0) Gecko/20100101 Firefox/58.0',
        ]
        self.proxy = proxy
        _, self.currRoot = get_locations()

        # Identity test: None is a singleton, and "== None" would defer to any
        # custom __eq__ defined by the supplied logger object.
        if logger is None:
            logger = get_logger(os.path.join(self.currRoot, "logs/profilescraper.log"))
        self.logger = logger
Beispiel #9
0
    def __init__(self, keywords, logger=None):
        '''
        Store the keywords, derive the coin names and set up logging.

        Parameters:
        ___________
        keywords (dict):
        Its keys are stored as self.coins

        logger (logger) (optional):
        Logger to use. When None, a logger writing to
        <root>/logs/run_utils.txt is created
        '''
        self.keywords = keywords
        self.coins = list(keywords)
        _, self.currRoot_dir = get_locations()

        # Identity test: None is a singleton, and "== None" would defer to any
        # custom __eq__ defined by the supplied logger object.
        if logger is None:
            logger = get_logger(self.currRoot_dir + '/logs/run_utils.txt')
        self.logger = logger
def test_get_logger():
    """Check that get_logger() writes logged messages to the given file.

    Logs one message, verifies that the file exists and that its first line
    is that message, then removes the file and verifies that it is gone.
    """
    _, rootDir = get_locations()
    flocation = rootDir + "/libs/tests/test.log"
    logger = get_logger(flocation)
    logger.info("abc")

    try:
        assert os.path.exists(flocation)

        with open(flocation, 'r') as f:
            assert f.readline() == 'abc\n'
    finally:
        # Remove the file even when an assertion above fails, so a failed run
        # does not leave a stale log behind for the next one.
        if os.path.exists(flocation):
            os.remove(flocation)

    assert not os.path.exists(flocation)
Beispiel #11
0
    def __init__(self, detailsList, relative_dir=""):
        '''
        Store the details list and resolve the logger and the profile folder.

        Parameters:
        ___________
        detailsList (list): 
        List containing keyword, coinname in string and start and end in date format. For looping

        relative_dir (string):
        The relative directory; joined between the project root and
        "data/profile" to form self.profile_path
        '''
        _, root_dir = get_locations()

        log_file = os.path.join(root_dir, "logs/profileprocess.log")
        self.logger = get_logger(log_file)

        self.detailsList = detailsList

        profile_dir = os.path.join(root_dir, relative_dir, "data/profile")
        self.profile_path = profile_dir
Beispiel #12
0
    def __init__(self, profiles, proxies=None, relative_dir=""):
        '''
        Store the profiles and proxies; resolve the logger and output folder.

        Parameters:
        __________
        profiles (Pandas Series or list or numpy):
        Series of profile name

        proxies:
        Stored unchanged on the instance (None by default)

        relative_dir (string):
        Joined between the project root and "data/profile" to form self.path
        '''
        self.profiles = profiles

        _, self.currRoot = get_locations()
        log_file = os.path.join(self.currRoot, "logs/profilescraper.log")
        self.logger = get_logger(log_file)

        output_dir = os.path.join(self.currRoot, relative_dir, "data/profile")
        self.path = output_dir
        self.proxies = proxies
Beispiel #13
0
    def __init__(self, detailsList, algo_name, relative_dir=""):
        '''
        Store the details list and algorithm name; resolve logger and paths.

        Parameters:
        ___________
        detailsList (list): 
        List containing keyword, coinname in string and start and end in date format. For looping

        algo_name (string):
        The name of the algorithm so as to save in file

        relative_dir (string):
        The relative directory; joined between the project root and
        "data/tweet/{}/historic_scrape" to form self.historic_path
        '''
        _, currRoot_dir = get_locations()

        # The log path component must be relative: os.path.join discards all
        # earlier components when a later one is absolute, so a leading "/"
        # sent the log to "/logs/historicprocess.log" instead of the project
        # root.
        self.logger = get_logger(os.path.join(currRoot_dir, "logs/historicprocess.log"))

        self.detailsList = detailsList
        self.algo_name = algo_name

        # The "{}" stays in the path as a placeholder for later formatting.
        self.historic_path = os.path.join(currRoot_dir, relative_dir, "data/tweet/{}/historic_scrape")
Beispiel #14
0
    def setup_method(self):
        """Build the query_live_tweets fixture shared by the tests of this class.

        Creates ``query_live_tweets`` with ``tweetCount=10``, obtains its
        listener via ``get_listener(create=True)``, runs ``perform_search()``
        and stores the first two values returned by ``listener.get_data()``
        on ``self.df`` and ``self.userData``.
        """
        # Coin name -> search terms tracked for that coin.
        self.keywords = {
            'bitcoin': ['bitcoin', 'BTC'],
            'dashcoin': ['dashcoin', 'DASH', 'darkcoin'],
            'dogecoin': ['dogecoin', 'DOGE'],
            'ethereum': ['ethereum', 'ETH'],
            'litecoin': ['litecoin', 'LTC'],
            'ripple': ['ripple', 'XRP'],
            'monero': ['monero', 'XMR'],
            'stellar': ['stellar', 'STR']
        }
        # Every search term of every coin, flattened into one list.
        self.keywordsOnly = [
            term for terms in self.keywords.values() for term in terms
        ]

        _, self.currRoot_dir = get_locations()
        log_file = self.currRoot_dir + '/livescraper/tests/live.log'
        self.logger = get_logger(log_file)

        self.qt = query_live_tweets(self.keywords, tweetCount=10)
        # get_listener() also returns the auth object, which is unused here.
        listener, _ = self.qt.get_listener(create=True)
        self.qt.perform_search()

        # The third value returned by get_data() is not needed by the tests.
        self.df, self.userData, _ = listener.get_data()
Beispiel #15
0
            try:
                shutil.rmtree("{}/data/tweet/{}".format(
                    self.currDir, coinname))
            except:
                pass

            self.logger.info("Removing {}/data/profile/{}".format(
                self.currDir, coinname))

            try:
                shutil.rmtree("{}/data/profile".format(self.currDir, coinname))
            except:
                pass


# Project root; used below as the base of both log file paths.
_, currRoot_dir = get_locations()
logger = get_logger(currRoot_dir + '/logs/run_live.txt')

# liveKeywords feeds the live download thread; both values go to runAll.
liveKeywords, historicList = get_keywords()
proxies = get_proxies()

ra = runAll(liveKeywords, historicList, proxies=proxies)
ra.initial_houskeeping(clean=False)  #change when required

# Run download_live(liveKeywords, logger) on its own thread so that the
# statements below execute alongside it.
t1 = threading.Thread(target=download_live, args=[liveKeywords, logger])
t1.start()

# Second logger, writing to a different file than the one given to the thread.
logger2 = get_logger(currRoot_dir + '/logs/run_live_2.txt')

# NOTE(review): the original comment here was partly romanized Nepali
# ("house folder contains kunai ma nabhako data"); it appears to say that the
# house folder contains data that is not in any of the others - confirm.
while True:
Beispiel #16
0
import sys

import pandas as pd
import numpy as np

import swifter
import numba
from multiprocessing import Pool, cpu_count

import nltk
from nltk.sentiment import vader

from libs.writing_utils import get_locations, get_logger
from libs.reading_utils import cleanData

sys.path.append(os.path.dirname(get_locations()[1]))

from common_modules.common_utils import merge_csvs, trends_ta

def applyVader(x, analyzer):
    """Return the 'compound' entry of ``analyzer.polarity_scores(x)``."""
    scores = analyzer.polarity_scores(x)
    return scores['compound']

def applyParallel(dfGrouped, func):
    """Apply ``func`` to every group in a process pool and collect the results.

    ``dfGrouped`` is iterated as ``(name, group)`` pairs; only the group is
    passed to ``func``. The pool has ``cpu_count()`` workers and the mapped
    results are wrapped in a new ``pd.DataFrame``.
    """
    with Pool(cpu_count()) as workers:
        groups = [frame for _, frame in dfGrouped]
        results = workers.map(func, groups)

    return pd.DataFrame(results)

class historicProcessor:
    '''
    Processor for historic data
def test_get_locations():
    """Check that the root from get_locations() lies on this file's path.

    The part of the returned root that precedes "twitter_data" must occur
    inside this module's ``__file__``.
    """
    _, root_dir = get_locations("twitter_data")
    prefix = root_dir.partition("twitter_data")[0]
    assert prefix in __file__